diff --git a/.dockerignore b/.dockerignore
index 9635889..140fab3 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,2 +1,5 @@
data
-tmp
\ No newline at end of file
+tmp
+results
+
+.env
\ No newline at end of file
diff --git a/.env.example b/.env.example
index d4bf83f..2e007b2 100644
--- a/.env.example
+++ b/.env.example
@@ -27,20 +27,27 @@ MOONSHOT_API_KEY=
UNBOUND_ENDPOINT=https://api.getunbound.ai
UNBOUND_API_KEY=
+SiliconFLOW_ENDPOINT=https://api.siliconflow.cn/v1/
+SiliconFLOW_API_KEY=
+
+IBM_ENDPOINT=https://us-south.ml.cloud.ibm.com
+IBM_API_KEY=
+IBM_PROJECT_ID=
+
# Set to false to disable anonymized telemetry
ANONYMIZED_TELEMETRY=false
# LogLevel: Set to debug to enable verbose logging, set to result to get results only. Available: result | debug | info
BROWSER_USE_LOGGING_LEVEL=info
-# Chrome settings
-CHROME_PATH=
-CHROME_USER_DATA=
-CHROME_DEBUGGING_PORT=9222
-CHROME_DEBUGGING_HOST=localhost
+# Browser settings
+BROWSER_PATH=
+BROWSER_USER_DATA=
+BROWSER_DEBUGGING_PORT=9222
+BROWSER_DEBUGGING_HOST=localhost
# Set to true to keep browser open between AI tasks
-CHROME_PERSISTENT_SESSION=false
-CHROME_CDP=
+KEEP_BROWSER_OPEN=true
+BROWSER_CDP=
# Display settings
# Format: WIDTHxHEIGHTxDEPTH
RESOLUTION=1920x1080x24
diff --git a/.gitignore b/.gitignore
index a3f269d..a7a55cd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -187,3 +187,6 @@ data/
# For Config Files (Current Settings)
.config.pkl
+*.pdf
+
+workflow
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
index 7b6d39f..19c4b94 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,5 +1,9 @@
FROM python:3.11-slim
+# Set platform for multi-arch builds (Docker Buildx will set this)
+ARG TARGETPLATFORM
+ARG NODE_MAJOR=20
+
# Install system dependencies
RUN apt-get update && apt-get install -y \
wget \
@@ -28,7 +32,6 @@ RUN apt-get update && apt-get install -y \
fonts-liberation \
dbus \
xauth \
- xvfb \
x11vnc \
tigervnc-tools \
supervisor \
@@ -40,6 +43,7 @@ RUN apt-get update && apt-get install -y \
fonts-dejavu \
fonts-dejavu-core \
fonts-dejavu-extra \
+ vim \
&& rm -rf /var/lib/apt/lists/*
# Install noVNC
@@ -47,40 +51,50 @@ RUN git clone https://github.com/novnc/noVNC.git /opt/novnc \
&& git clone https://github.com/novnc/websockify /opt/novnc/utils/websockify \
&& ln -s /opt/novnc/vnc.html /opt/novnc/index.html
-# Set platform for ARM64 compatibility
-ARG TARGETPLATFORM=linux/amd64
+# Install Node.js using NodeSource PPA
+RUN mkdir -p /etc/apt/keyrings \
+ && curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg \
+ && echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_$NODE_MAJOR.x nodistro main" | tee /etc/apt/sources.list.d/nodesource.list \
+ && apt-get update \
+ && apt-get install nodejs -y \
+ && rm -rf /var/lib/apt/lists/*
+
+# Verify Node.js and npm installation (optional, but good for debugging)
+RUN node -v && npm -v && npx -v
# Set up working directory
WORKDIR /app
# Copy requirements and install Python dependencies
COPY requirements.txt .
+# Ensure 'patchright' is in your requirements.txt or install it directly
+# RUN pip install --no-cache-dir -r requirements.txt patchright # If not in requirements
RUN pip install --no-cache-dir -r requirements.txt
-# Install Playwright and browsers with system dependencies
-ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
-RUN playwright install --with-deps chromium
-RUN playwright install-deps
+# Install Patchright browsers and dependencies
+# Patchright documentation suggests PLAYWRIGHT_BROWSERS_PATH is still relevant
+# or that Patchright installs to a similar default location that Playwright would.
+# Let's assume Patchright respects PLAYWRIGHT_BROWSERS_PATH or its default install location is findable.
+ENV PLAYWRIGHT_BROWSERS_PATH=/ms-browsers
+RUN mkdir -p $PLAYWRIGHT_BROWSERS_PATH
+
+# Install recommended: Google Chrome (instead of just Chromium for better undetectability)
+# The 'patchright install chrome' command might download and place it.
+# The '--with-deps' equivalent for patchright install is to run 'patchright install-deps chrome' after.
+# RUN patchright install chrome --with-deps
+
+# Alternative: Install Chromium if Google Chrome is problematic in certain environments
+RUN patchright install chromium --with-deps
+
# Copy the application code
COPY . .
-# Set environment variables
-ENV PYTHONUNBUFFERED=1
-ENV BROWSER_USE_LOGGING_LEVEL=info
-ENV CHROME_PATH=/ms-playwright/chromium-*/chrome-linux/chrome
-ENV ANONYMIZED_TELEMETRY=false
-ENV DISPLAY=:99
-ENV RESOLUTION=1920x1080x24
-ENV VNC_PASSWORD=vncpassword
-ENV CHROME_PERSISTENT_SESSION=true
-ENV RESOLUTION_WIDTH=1920
-ENV RESOLUTION_HEIGHT=1080
-
# Set up supervisor configuration
RUN mkdir -p /var/log/supervisor
COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf
-EXPOSE 7788 6080 5901
+EXPOSE 7788 6080 5901 9222
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]
+#CMD ["/bin/bash"]
\ No newline at end of file
diff --git a/Dockerfile.arm64 b/Dockerfile.arm64
deleted file mode 100644
index 6d7a3ff..0000000
--- a/Dockerfile.arm64
+++ /dev/null
@@ -1,85 +0,0 @@
-FROM python:3.11-slim
-
-# Install system dependencies
-RUN apt-get update && apt-get install -y \
- wget \
- gnupg \
- curl \
- unzip \
- xvfb \
- libgconf-2-4 \
- libxss1 \
- libnss3 \
- libnspr4 \
- libasound2 \
- libatk1.0-0 \
- libatk-bridge2.0-0 \
- libcups2 \
- libdbus-1-3 \
- libdrm2 \
- libgbm1 \
- libgtk-3-0 \
- libxcomposite1 \
- libxdamage1 \
- libxfixes3 \
- libxrandr2 \
- xdg-utils \
- fonts-liberation \
- dbus \
- xauth \
- xvfb \
- x11vnc \
- tigervnc-tools \
- supervisor \
- net-tools \
- procps \
- git \
- python3-numpy \
- fontconfig \
- fonts-dejavu \
- fonts-dejavu-core \
- fonts-dejavu-extra \
- && rm -rf /var/lib/apt/lists/*
-
-# Install noVNC
-RUN git clone https://github.com/novnc/noVNC.git /opt/novnc \
- && git clone https://github.com/novnc/websockify /opt/novnc/utils/websockify \
- && ln -s /opt/novnc/vnc.html /opt/novnc/index.html
-
-# Set platform explicitly for ARM64
-ARG TARGETPLATFORM=linux/arm64
-
-# Set up working directory
-WORKDIR /app
-
-# Copy requirements and install Python dependencies
-COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
-
-# Install Playwright and browsers with system dependencies optimized for ARM64
-ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
-RUN PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1 pip install playwright && \
- playwright install --with-deps chromium
-
-# Copy the application code
-COPY . .
-
-# Set environment variables
-ENV PYTHONUNBUFFERED=1
-ENV BROWSER_USE_LOGGING_LEVEL=info
-ENV CHROME_PATH=/ms-playwright/chromium-*/chrome-linux/chrome
-ENV ANONYMIZED_TELEMETRY=false
-ENV DISPLAY=:99
-ENV RESOLUTION=1920x1080x24
-ENV VNC_PASSWORD=vncpassword
-ENV CHROME_PERSISTENT_SESSION=true
-ENV RESOLUTION_WIDTH=1920
-ENV RESOLUTION_HEIGHT=1080
-
-# Set up supervisor configuration
-RUN mkdir -p /var/log/supervisor
-COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf
-
-EXPOSE 7788 6080 5901
-
-CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]
\ No newline at end of file
diff --git a/README.md b/README.md
index 9fd442f..b67a2ed 100644
--- a/README.md
+++ b/README.md
@@ -23,10 +23,6 @@ We would like to officially thank [WarmShao](https://github.com/warmshao) for hi
## Installation Guide
-### Prerequisites
-- Python 3.11 or higher
-- Git (for cloning the repository)
-
### Option 1: Local Installation
Read the [quickstart guide](https://docs.browser-use.com/quickstart#prepare-the-environment) or follow the steps below to get started.
@@ -65,15 +61,13 @@ Install Python packages:
uv pip install -r requirements.txt
```
-Install Browsers in Playwright:
-You can install specific browsers by running:
+Install browsers in Patchright:
```bash
-playwright install --with-deps chromium
+patchright install --with-deps
```
-
-To install all browsers:
+Or you can install specific browsers by running:
```bash
-playwright install
+patchright install chromium --with-deps
```
#### Step 4: Configure Environment
@@ -88,6 +82,29 @@ cp .env.example .env
```
2. Open `.env` in your preferred text editor and add your API keys and other settings
+#### Step 5: Enjoy the web-ui
+1. **Run the WebUI:**
+ ```bash
+ python webui.py --ip 127.0.0.1 --port 7788
+ ```
+2. **Access the WebUI:** Open your web browser and navigate to `http://127.0.0.1:7788`.
+3. **Using Your Own Browser (Optional):**
+ - Set `BROWSER_PATH` to the executable path of your browser and `BROWSER_USER_DATA` to the user data directory of your browser. Leave `BROWSER_USER_DATA` empty if you want to use local user data.
+ - Windows
+ ```env
+ BROWSER_PATH="C:\Program Files\Google\Chrome\Application\chrome.exe"
+ BROWSER_USER_DATA="C:\Users\YourUsername\AppData\Local\Google\Chrome\User Data"
+ ```
+ > Note: Replace `YourUsername` with your actual Windows username for Windows systems.
+ - Mac
+ ```env
+ BROWSER_PATH="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
+ BROWSER_USER_DATA="/Users/YourUsername/Library/Application Support/Google/Chrome"
+ ```
+ - Close all Chrome windows
+ - Open the WebUI in a non-Chrome browser, such as Firefox or Edge. This is important because the persistent browser context will use the Chrome data when running the agent.
+ - Check the "Use Own Browser" option within the Browser Settings.
+
### Option 2: Docker Installation
#### Prerequisites
@@ -95,14 +112,14 @@ cp .env.example .env
- [Docker Desktop](https://www.docker.com/products/docker-desktop/) (For Windows/macOS)
- [Docker Engine](https://docs.docker.com/engine/install/) and [Docker Compose](https://docs.docker.com/compose/install/) (For Linux)
-#### Installation Steps
-1. Clone the repository:
+#### Step 1: Clone the Repository
```bash
git clone https://github.com/browser-use/web-ui.git
cd web-ui
```
-2. Create and configure environment file:
+#### Step 2: Configure Environment
+1. Create a copy of the example environment file:
- Windows (Command Prompt):
```bash
copy .env.example .env
@@ -111,127 +128,23 @@ copy .env.example .env
```bash
cp .env.example .env
```
-Edit `.env` with your preferred text editor and add your API keys
+2. Open `.env` in your preferred text editor and add your API keys and other settings
-3. Run with Docker:
+#### Step 3: Docker Build and Run
```bash
-# Build and start the container with default settings (browser closes after AI tasks)
docker compose up --build
```
+For ARM64 systems (e.g., Apple Silicon Macs), please run the following command:
```bash
-# Or run with persistent browser (browser stays open between AI tasks)
-CHROME_PERSISTENT_SESSION=true docker compose up --build
+TARGETPLATFORM=linux/arm64 docker compose up --build
```
-
-4. Access the Application:
-- Web Interface: Open `http://localhost:7788` in your browser
+#### Step 4: Enjoy the web-ui and vnc
+- Web-UI: Open `http://localhost:7788` in your browser
- VNC Viewer (for watching browser interactions): Open `http://localhost:6080/vnc.html`
- Default VNC password: "youvncpassword"
- Can be changed by setting `VNC_PASSWORD` in your `.env` file
-## Usage
-
-### Local Setup
-1. **Run the WebUI:**
- After completing the installation steps above, start the application:
- ```bash
- python webui.py --ip 127.0.0.1 --port 7788
- ```
-2. WebUI options:
- - `--ip`: The IP address to bind the WebUI to. Default is `127.0.0.1`.
- - `--port`: The port to bind the WebUI to. Default is `7788`.
- - `--theme`: The theme for the user interface. Default is `Ocean`.
- - **Default**: The standard theme with a balanced design.
- - **Soft**: A gentle, muted color scheme for a relaxed viewing experience.
- - **Monochrome**: A grayscale theme with minimal color for simplicity and focus.
- - **Glass**: A sleek, semi-transparent design for a modern appearance.
- - **Origin**: A classic, retro-inspired theme for a nostalgic feel.
- - **Citrus**: A vibrant, citrus-inspired palette with bright and fresh colors.
- - **Ocean** (default): A blue, ocean-inspired theme providing a calming effect.
- - `--dark-mode`: Enables dark mode for the user interface.
-3. **Access the WebUI:** Open your web browser and navigate to `http://127.0.0.1:7788`.
-4. **Using Your Own Browser(Optional):**
- - Set `CHROME_PATH` to the executable path of your browser and `CHROME_USER_DATA` to the user data directory of your browser. Leave `CHROME_USER_DATA` empty if you want to use local user data.
- - Windows
- ```env
- CHROME_PATH="C:\Program Files\Google\Chrome\Application\chrome.exe"
- CHROME_USER_DATA="C:\Users\YourUsername\AppData\Local\Google\Chrome\User Data"
- ```
- > Note: Replace `YourUsername` with your actual Windows username for Windows systems.
- - Mac
- ```env
- CHROME_PATH="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
- CHROME_USER_DATA="/Users/YourUsername/Library/Application Support/Google/Chrome"
- ```
- - Close all Chrome windows
- - Open the WebUI in a non-Chrome browser, such as Firefox or Edge. This is important because the persistent browser context will use the Chrome data when running the agent.
- - Check the "Use Own Browser" option within the Browser Settings.
-5. **Keep Browser Open(Optional):**
- - Set `CHROME_PERSISTENT_SESSION=true` in the `.env` file.
-
-### Docker Setup
-1. **Environment Variables:**
- - All configuration is done through the `.env` file
- - Available environment variables:
- ```
- # LLM API Keys
- OPENAI_API_KEY=your_key_here
- ANTHROPIC_API_KEY=your_key_here
- GOOGLE_API_KEY=your_key_here
-
- # Browser Settings
- CHROME_PERSISTENT_SESSION=true # Set to true to keep browser open between AI tasks
- RESOLUTION=1920x1080x24 # Custom resolution format: WIDTHxHEIGHTxDEPTH
- RESOLUTION_WIDTH=1920 # Custom width in pixels
- RESOLUTION_HEIGHT=1080 # Custom height in pixels
-
- # VNC Settings
- VNC_PASSWORD=your_vnc_password # Optional, defaults to "vncpassword"
- ```
-
-2. **Platform Support:**
- - Supports both AMD64 and ARM64 architectures
- - For ARM64 systems (e.g., Apple Silicon Macs), the container will automatically use the appropriate image
-
-3. **Browser Persistence Modes:**
- - **Default Mode (CHROME_PERSISTENT_SESSION=false):**
- - Browser opens and closes with each AI task
- - Clean state for each interaction
- - Lower resource usage
-
- - **Persistent Mode (CHROME_PERSISTENT_SESSION=true):**
- - Browser stays open between AI tasks
- - Maintains history and state
- - Allows viewing previous AI interactions
- - Set in `.env` file or via environment variable when starting container
-
-4. **Viewing Browser Interactions:**
- - Access the noVNC viewer at `http://localhost:6080/vnc.html`
- - Enter the VNC password (default: "vncpassword" or what you set in VNC_PASSWORD)
- - Direct VNC access available on port 5900 (mapped to container port 5901)
- - You can now see all browser interactions in real-time
-
-5. **Container Management:**
- ```bash
- # Start with persistent browser
- CHROME_PERSISTENT_SESSION=true docker compose up -d
-
- # Start with default mode (browser closes after tasks)
- docker compose up -d
-
- # View logs
- docker compose logs -f
-
- # Stop the container
- docker compose down
- ```
-
-6. **Using precompiled image**
- ```bash
- docker pull ghcr.io/browser-use/web-ui
- ```
-
## Changelog
- [x] **2025/01/26:** Thanks to @vvincent1234. Now browser-use-webui can combine with DeepSeek-r1 to engage in deep thinking!
- [x] **2025/01/10:** Thanks to @casistack. Now we have Docker Setup option and also Support keep browser open between tasks.[Video tutorial demo](https://github.com/browser-use/web-ui/issues/1#issuecomment-2582511750).
diff --git a/docker-compose.yml b/docker-compose.yml
index a00a4d3..b5051cb 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,59 +1,80 @@
services:
+ # debug: docker compose run --rm -it browser-use-webui bash
browser-use-webui:
# image: ghcr.io/browser-use/web-ui # Using precompiled image
build:
context: .
- dockerfile: ${DOCKERFILE:-Dockerfile}
+ dockerfile: Dockerfile
args:
TARGETPLATFORM: ${TARGETPLATFORM:-linux/amd64}
ports:
- - "7788:7788" # Gradio default port
- - "6080:6080" # noVNC web interface
- - "5901:5901" # VNC port
- - "9222:9222" # Chrome remote debugging port
+ - "7788:7788"
+ - "6080:6080"
+ - "5901:5901"
+ - "9222:9222"
environment:
+ # LLM API Keys & Endpoints
- OPENAI_ENDPOINT=${OPENAI_ENDPOINT:-https://api.openai.com/v1}
- OPENAI_API_KEY=${OPENAI_API_KEY:-}
- - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
- ANTHROPIC_ENDPOINT=${ANTHROPIC_ENDPOINT:-https://api.anthropic.com}
+ - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
- GOOGLE_API_KEY=${GOOGLE_API_KEY:-}
- AZURE_OPENAI_ENDPOINT=${AZURE_OPENAI_ENDPOINT:-}
- AZURE_OPENAI_API_KEY=${AZURE_OPENAI_API_KEY:-}
+ - AZURE_OPENAI_API_VERSION=${AZURE_OPENAI_API_VERSION:-2025-01-01-preview}
- DEEPSEEK_ENDPOINT=${DEEPSEEK_ENDPOINT:-https://api.deepseek.com}
- DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY:-}
- OLLAMA_ENDPOINT=${OLLAMA_ENDPOINT:-http://localhost:11434}
- - MISTRAL_API_KEY=${MISTRAL_API_KEY:-}
- MISTRAL_ENDPOINT=${MISTRAL_ENDPOINT:-https://api.mistral.ai/v1}
+ - MISTRAL_API_KEY=${MISTRAL_API_KEY:-}
- ALIBABA_ENDPOINT=${ALIBABA_ENDPOINT:-https://dashscope.aliyuncs.com/compatible-mode/v1}
- ALIBABA_API_KEY=${ALIBABA_API_KEY:-}
- MOONSHOT_ENDPOINT=${MOONSHOT_ENDPOINT:-https://api.moonshot.cn/v1}
- MOONSHOT_API_KEY=${MOONSHOT_API_KEY:-}
- - BROWSER_USE_LOGGING_LEVEL=${BROWSER_USE_LOGGING_LEVEL:-info}
+ - UNBOUND_ENDPOINT=${UNBOUND_ENDPOINT:-https://api.getunbound.ai}
+ - UNBOUND_API_KEY=${UNBOUND_API_KEY:-}
+ - SiliconFLOW_ENDPOINT=${SiliconFLOW_ENDPOINT:-https://api.siliconflow.cn/v1/}
+ - SiliconFLOW_API_KEY=${SiliconFLOW_API_KEY:-}
+ - IBM_ENDPOINT=${IBM_ENDPOINT:-https://us-south.ml.cloud.ibm.com}
+ - IBM_API_KEY=${IBM_API_KEY:-}
+ - IBM_PROJECT_ID=${IBM_PROJECT_ID:-}
+
+ # Application Settings
- ANONYMIZED_TELEMETRY=${ANONYMIZED_TELEMETRY:-false}
- - CHROME_PATH=/usr/bin/google-chrome
- - CHROME_USER_DATA=/app/data/chrome_data
- - CHROME_PERSISTENT_SESSION=${CHROME_PERSISTENT_SESSION:-false}
- - CHROME_CDP=${CHROME_CDP:-http://localhost:9222}
+ - BROWSER_USE_LOGGING_LEVEL=${BROWSER_USE_LOGGING_LEVEL:-info}
+
+ # Browser Settings
+ - BROWSER_PATH=
+ - BROWSER_USER_DATA=
+ - BROWSER_DEBUGGING_PORT=${BROWSER_DEBUGGING_PORT:-9222}
+ - BROWSER_DEBUGGING_HOST=localhost
+ - USE_OWN_BROWSER=false
+ - KEEP_BROWSER_OPEN=true
+ - BROWSER_CDP=${BROWSER_CDP:-} # e.g., http://localhost:9222
+
+ # Display Settings
- DISPLAY=:99
- - PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
+ # This ENV is used by the Dockerfile during build time if Patchright respects it.
+ # It's not strictly needed at runtime by docker-compose unless your app or scripts also read it.
+ - PLAYWRIGHT_BROWSERS_PATH=/ms-browsers # Matches Dockerfile ENV
- RESOLUTION=${RESOLUTION:-1920x1080x24}
- RESOLUTION_WIDTH=${RESOLUTION_WIDTH:-1920}
- RESOLUTION_HEIGHT=${RESOLUTION_HEIGHT:-1080}
- - VNC_PASSWORD=${VNC_PASSWORD:-vncpassword}
- - CHROME_DEBUGGING_PORT=9222
- - CHROME_DEBUGGING_HOST=localhost
+
+ # VNC Settings
+ - VNC_PASSWORD=${VNC_PASSWORD:-youvncpassword}
+
volumes:
- /tmp/.X11-unix:/tmp/.X11-unix
+ # - ./my_chrome_data:/app/data/chrome_data # Optional: persist browser data
restart: unless-stopped
shm_size: '2gb'
cap_add:
- SYS_ADMIN
- security_opt:
- - seccomp=unconfined
tmpfs:
- /tmp
healthcheck:
- test: ["CMD", "nc", "-z", "localhost", "5901"]
+ test: ["CMD", "nc", "-z", "localhost", "5901"] # VNC port
interval: 10s
timeout: 5s
- retries: 3
+ retries: 3
\ No newline at end of file
diff --git a/entrypoint.sh b/entrypoint.sh
deleted file mode 100644
index 9ab9240..0000000
--- a/entrypoint.sh
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/bin/bash
-
-# Start supervisord in the foreground to properly manage child processes
-exec /usr/bin/supervisord -n -c /etc/supervisor/conf.d/supervisord.conf
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 7f2d12c..bc8de8c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,10 @@
-browser-use==0.1.40
+browser-use==0.1.45
pyperclip==1.9.0
-gradio==5.23.1
+gradio==5.27.0
json-repair
langchain-mistralai==0.2.4
-langchain-google-genai==2.0.8
MainContentExtractor==0.0.4
+langchain-ibm==0.3.10
+langchain_mcp_adapters==0.0.9
+langgraph==0.3.34
+langchain-community
diff --git a/src/agent/browser_use/browser_use_agent.py b/src/agent/browser_use/browser_use_agent.py
new file mode 100644
index 0000000..d5cba0f
--- /dev/null
+++ b/src/agent/browser_use/browser_use_agent.py
@@ -0,0 +1,185 @@
+from __future__ import annotations
+
+import asyncio
+import logging
+import os
+
+# from lmnr.sdk.decorators import observe
+from browser_use.agent.gif import create_history_gif
+from browser_use.agent.service import Agent, AgentHookFunc
+from browser_use.agent.views import (
+ ActionResult,
+ AgentHistory,
+ AgentHistoryList,
+ AgentStepInfo,
+ ToolCallingMethod,
+)
+from browser_use.browser.views import BrowserStateHistory
+from browser_use.telemetry.views import (
+ AgentEndTelemetryEvent,
+)
+from browser_use.utils import time_execution_async
+from dotenv import load_dotenv
+from browser_use.agent.message_manager.utils import is_model_without_tool_support
+
+load_dotenv()
+logger = logging.getLogger(__name__)
+
+SKIP_LLM_API_KEY_VERIFICATION = (
+ os.environ.get("SKIP_LLM_API_KEY_VERIFICATION", "false").lower()[0] in "ty1"
+)
+
+
+class BrowserUseAgent(Agent):
+ def _set_tool_calling_method(self) -> ToolCallingMethod | None:
+ tool_calling_method = self.settings.tool_calling_method
+ if tool_calling_method == 'auto':
+ if is_model_without_tool_support(self.model_name):
+ return 'raw'
+ elif self.chat_model_library == 'ChatGoogleGenerativeAI':
+ return None
+ elif self.chat_model_library == 'ChatOpenAI':
+ return 'function_calling'
+ elif self.chat_model_library == 'AzureChatOpenAI':
+ return 'function_calling'
+ else:
+ return None
+ else:
+ return tool_calling_method
+
+ @time_execution_async("--run (agent)")
+ async def run(
+ self, max_steps: int = 100, on_step_start: AgentHookFunc | None = None,
+ on_step_end: AgentHookFunc | None = None
+ ) -> AgentHistoryList:
+ """Execute the task with maximum number of steps"""
+
+ loop = asyncio.get_event_loop()
+
+ # Set up the Ctrl+C signal handler with callbacks specific to this agent
+ from browser_use.utils import SignalHandler
+
+ signal_handler = SignalHandler(
+ loop=loop,
+ pause_callback=self.pause,
+ resume_callback=self.resume,
+ custom_exit_callback=None, # No special cleanup needed on forced exit
+ exit_on_second_int=True,
+ )
+ signal_handler.register()
+
+ try:
+ self._log_agent_run()
+
+ # Execute initial actions if provided
+ if self.initial_actions:
+ result = await self.multi_act(self.initial_actions, check_for_new_elements=False)
+ self.state.last_result = result
+
+ for step in range(max_steps):
+ # Check if waiting for user input after Ctrl+C
+ if self.state.paused:
+ signal_handler.wait_for_resume()
+ signal_handler.reset()
+
+ # Check if we should stop due to too many failures
+ if self.state.consecutive_failures >= self.settings.max_failures:
+ logger.error(f'ā Stopping due to {self.settings.max_failures} consecutive failures')
+ break
+
+ # Check control flags before each step
+ if self.state.stopped:
+ logger.info('Agent stopped')
+ break
+
+ while self.state.paused:
+ await asyncio.sleep(0.2) # Small delay to prevent CPU spinning
+ if self.state.stopped: # Allow stopping while paused
+ break
+
+ if on_step_start is not None:
+ await on_step_start(self)
+
+ step_info = AgentStepInfo(step_number=step, max_steps=max_steps)
+ await self.step(step_info)
+
+ if on_step_end is not None:
+ await on_step_end(self)
+
+ if self.state.history.is_done():
+ if self.settings.validate_output and step < max_steps - 1:
+ if not await self._validate_output():
+ continue
+
+ await self.log_completion()
+ break
+ else:
+ error_message = 'Failed to complete task in maximum steps'
+
+ self.state.history.history.append(
+ AgentHistory(
+ model_output=None,
+ result=[ActionResult(error=error_message, include_in_memory=True)],
+ state=BrowserStateHistory(
+ url='',
+ title='',
+ tabs=[],
+ interacted_element=[],
+ screenshot=None,
+ ),
+ metadata=None,
+ )
+ )
+
+ logger.info(f'ā {error_message}')
+
+ return self.state.history
+
+ except KeyboardInterrupt:
+ # Already handled by our signal handler, but catch any direct KeyboardInterrupt as well
+ logger.info('Got KeyboardInterrupt during execution, returning current history')
+ return self.state.history
+
+ finally:
+ # Unregister signal handlers before cleanup
+ signal_handler.unregister()
+
+ self.telemetry.capture(
+ AgentEndTelemetryEvent(
+ agent_id=self.state.agent_id,
+ is_done=self.state.history.is_done(),
+ success=self.state.history.is_successful(),
+ steps=self.state.n_steps,
+ max_steps_reached=self.state.n_steps >= max_steps,
+ errors=self.state.history.errors(),
+ total_input_tokens=self.state.history.total_input_tokens(),
+ total_duration_seconds=self.state.history.total_duration_seconds(),
+ )
+ )
+
+ if self.settings.save_playwright_script_path:
+ logger.info(
+ f'Agent run finished. Attempting to save Playwright script to: {self.settings.save_playwright_script_path}'
+ )
+ try:
+ # Extract sensitive data keys if sensitive_data is provided
+ keys = list(self.sensitive_data.keys()) if self.sensitive_data else None
+ # Pass browser and context config to the saving method
+ self.state.history.save_as_playwright_script(
+ self.settings.save_playwright_script_path,
+ sensitive_data_keys=keys,
+ browser_config=self.browser.config,
+ context_config=self.browser_context.config,
+ )
+ except Exception as script_gen_err:
+ # Log any error during script generation/saving
+ logger.error(f'Failed to save Playwright script: {script_gen_err}', exc_info=True)
+
+ await self.close()
+
+ if self.settings.generate_gif:
+ output_path: str = 'agent_history.gif'
+ if isinstance(self.settings.generate_gif, str):
+ output_path = self.settings.generate_gif
+
+ create_history_gif(task=self.task, history=self.state.history, output_path=output_path)
diff --git a/src/agent/custom_agent.py b/src/agent/custom_agent.py
deleted file mode 100644
index 4b0eff3..0000000
--- a/src/agent/custom_agent.py
+++ /dev/null
@@ -1,478 +0,0 @@
-import json
-import logging
-import pdb
-import traceback
-from typing import Any, Awaitable, Callable, Dict, Generic, List, Optional, Type, TypeVar
-from PIL import Image, ImageDraw, ImageFont
-import os
-import base64
-import io
-import asyncio
-import time
-import platform
-from browser_use.agent.prompts import SystemPrompt, AgentMessagePrompt
-from browser_use.agent.service import Agent
-from browser_use.agent.message_manager.utils import convert_input_messages, extract_json_from_model_output, \
- save_conversation
-from browser_use.agent.views import (
- ActionResult,
- AgentError,
- AgentHistory,
- AgentHistoryList,
- AgentOutput,
- AgentSettings,
- AgentState,
- AgentStepInfo,
- StepMetadata,
- ToolCallingMethod,
-)
-from browser_use.agent.gif import create_history_gif
-from browser_use.browser.browser import Browser
-from browser_use.browser.context import BrowserContext
-from browser_use.browser.views import BrowserStateHistory
-from browser_use.controller.service import Controller
-from browser_use.telemetry.views import (
- AgentEndTelemetryEvent,
- AgentRunTelemetryEvent,
- AgentStepTelemetryEvent,
-)
-from browser_use.utils import time_execution_async
-from langchain_core.language_models.chat_models import BaseChatModel
-from langchain_core.messages import (
- BaseMessage,
- HumanMessage,
- AIMessage
-)
-from browser_use.browser.views import BrowserState, BrowserStateHistory
-from browser_use.agent.prompts import PlannerPrompt
-
-from json_repair import repair_json
-from src.utils.agent_state import AgentState
-
-from .custom_message_manager import CustomMessageManager, CustomMessageManagerSettings
-from .custom_views import CustomAgentOutput, CustomAgentStepInfo, CustomAgentState
-
-logger = logging.getLogger(__name__)
-
-Context = TypeVar('Context')
-
-
-class CustomAgent(Agent):
- def __init__(
- self,
- task: str,
- llm: BaseChatModel,
- add_infos: str = "",
- # Optional parameters
- browser: Browser | None = None,
- browser_context: BrowserContext | None = None,
- controller: Controller[Context] = Controller(),
- # Initial agent run parameters
- sensitive_data: Optional[Dict[str, str]] = None,
- initial_actions: Optional[List[Dict[str, Dict[str, Any]]]] = None,
- # Cloud Callbacks
- register_new_step_callback: Callable[['BrowserState', 'AgentOutput', int], Awaitable[None]] | None = None,
- register_done_callback: Callable[['AgentHistoryList'], Awaitable[None]] | None = None,
- register_external_agent_status_raise_error_callback: Callable[[], Awaitable[bool]] | None = None,
- # Agent settings
- use_vision: bool = True,
- use_vision_for_planner: bool = False,
- save_conversation_path: Optional[str] = None,
- save_conversation_path_encoding: Optional[str] = 'utf-8',
- max_failures: int = 3,
- retry_delay: int = 10,
- system_prompt_class: Type[SystemPrompt] = SystemPrompt,
- agent_prompt_class: Type[AgentMessagePrompt] = AgentMessagePrompt,
- max_input_tokens: int = 128000,
- validate_output: bool = False,
- message_context: Optional[str] = None,
- generate_gif: bool | str = False,
- available_file_paths: Optional[list[str]] = None,
- include_attributes: list[str] = [
- 'title',
- 'type',
- 'name',
- 'role',
- 'aria-label',
- 'placeholder',
- 'value',
- 'alt',
- 'aria-expanded',
- 'data-date-format',
- ],
- max_actions_per_step: int = 10,
- tool_calling_method: Optional[ToolCallingMethod] = 'auto',
- page_extraction_llm: Optional[BaseChatModel] = None,
- planner_llm: Optional[BaseChatModel] = None,
- planner_interval: int = 1, # Run planner every N steps
- # Inject state
- injected_agent_state: Optional[AgentState] = None,
- context: Context | None = None,
- ):
- super(CustomAgent, self).__init__(
- task=task,
- llm=llm,
- browser=browser,
- browser_context=browser_context,
- controller=controller,
- sensitive_data=sensitive_data,
- initial_actions=initial_actions,
- register_new_step_callback=register_new_step_callback,
- register_done_callback=register_done_callback,
- register_external_agent_status_raise_error_callback=register_external_agent_status_raise_error_callback,
- use_vision=use_vision,
- use_vision_for_planner=use_vision_for_planner,
- save_conversation_path=save_conversation_path,
- save_conversation_path_encoding=save_conversation_path_encoding,
- max_failures=max_failures,
- retry_delay=retry_delay,
- system_prompt_class=system_prompt_class,
- max_input_tokens=max_input_tokens,
- validate_output=validate_output,
- message_context=message_context,
- generate_gif=generate_gif,
- available_file_paths=available_file_paths,
- include_attributes=include_attributes,
- max_actions_per_step=max_actions_per_step,
- tool_calling_method=tool_calling_method,
- page_extraction_llm=page_extraction_llm,
- planner_llm=planner_llm,
- planner_interval=planner_interval,
- injected_agent_state=injected_agent_state,
- context=context,
- )
- self.state = injected_agent_state or CustomAgentState()
- self.add_infos = add_infos
- self._message_manager = CustomMessageManager(
- task=task,
- system_message=self.settings.system_prompt_class(
- self.available_actions,
- max_actions_per_step=self.settings.max_actions_per_step,
- ).get_system_message(),
- settings=CustomMessageManagerSettings(
- max_input_tokens=self.settings.max_input_tokens,
- include_attributes=self.settings.include_attributes,
- message_context=self.settings.message_context,
- sensitive_data=sensitive_data,
- available_file_paths=self.settings.available_file_paths,
- agent_prompt_class=agent_prompt_class
- ),
- state=self.state.message_manager_state,
- )
-
- def _log_response(self, response: CustomAgentOutput) -> None:
- """Log the model's response"""
- if "Success" in response.current_state.evaluation_previous_goal:
- emoji = "ā"
- elif "Failed" in response.current_state.evaluation_previous_goal:
- emoji = "ā"
- else:
- emoji = "š¤·"
-
- logger.info(f"{emoji} Eval: {response.current_state.evaluation_previous_goal}")
- logger.info(f"š§ New Memory: {response.current_state.important_contents}")
- logger.info(f"š¤ Thought: {response.current_state.thought}")
- logger.info(f"šÆ Next Goal: {response.current_state.next_goal}")
- for i, action in enumerate(response.action):
- logger.info(
- f"š ļø Action {i + 1}/{len(response.action)}: {action.model_dump_json(exclude_unset=True)}"
- )
-
- def _setup_action_models(self) -> None:
- """Setup dynamic action models from controller's registry"""
- # Get the dynamic action model from controller's registry
- self.ActionModel = self.controller.registry.create_action_model()
- # Create output model with the dynamic actions
- self.AgentOutput = CustomAgentOutput.type_with_custom_actions(self.ActionModel)
-
- def update_step_info(
- self, model_output: CustomAgentOutput, step_info: CustomAgentStepInfo = None
- ):
- """
- update step info
- """
- if step_info is None:
- return
-
- step_info.step_number += 1
- important_contents = model_output.current_state.important_contents
- if (
- important_contents
- and "None" not in important_contents
- and important_contents not in step_info.memory
- ):
- step_info.memory += important_contents + "\n"
-
- logger.info(f"š§ All Memory: \n{step_info.memory}")
-
- @time_execution_async("--get_next_action")
- async def get_next_action(self, input_messages: list[BaseMessage]) -> AgentOutput:
- """Get next action from LLM based on current state"""
- fixed_input_messages = self._convert_input_messages(input_messages)
- ai_message = self.llm.invoke(fixed_input_messages)
- self.message_manager._add_message_with_tokens(ai_message)
-
- if hasattr(ai_message, "reasoning_content"):
- logger.info("𤯠Start Deep Thinking: ")
- logger.info(ai_message.reasoning_content)
- logger.info("𤯠End Deep Thinking")
-
- if isinstance(ai_message.content, list):
- ai_content = ai_message.content[0]
- else:
- ai_content = ai_message.content
-
- try:
- ai_content = ai_content.replace("```json", "").replace("```", "")
- ai_content = repair_json(ai_content)
- parsed_json = json.loads(ai_content)
- parsed: AgentOutput = self.AgentOutput(**parsed_json)
- except Exception as e:
- import traceback
- traceback.print_exc()
- logger.debug(ai_message.content)
- raise ValueError('Could not parse response.')
-
- if parsed is None:
- logger.debug(ai_message.content)
- raise ValueError('Could not parse response.')
-
- # cut the number of actions to max_actions_per_step if needed
- if len(parsed.action) > self.settings.max_actions_per_step:
- parsed.action = parsed.action[: self.settings.max_actions_per_step]
- self._log_response(parsed)
- return parsed
-
- async def _run_planner(self) -> Optional[str]:
- """Run the planner to analyze state and suggest next steps"""
- # Skip planning if no planner_llm is set
- if not self.settings.planner_llm:
- return None
-
- # Create planner message history using full message history
- planner_messages = [
- PlannerPrompt(self.controller.registry.get_prompt_description()).get_system_message(),
- *self.message_manager.get_messages()[1:], # Use full message history except the first
- ]
-
- if not self.settings.use_vision_for_planner and self.settings.use_vision:
- last_state_message: HumanMessage = planner_messages[-1]
- # remove image from last state message
- new_msg = ''
- if isinstance(last_state_message.content, list):
- for msg in last_state_message.content:
- if msg['type'] == 'text':
- new_msg += msg['text']
- elif msg['type'] == 'image_url':
- continue
- else:
- new_msg = last_state_message.content
-
- planner_messages[-1] = HumanMessage(content=new_msg)
-
- # Get planner output
- response = await self.settings.planner_llm.ainvoke(planner_messages)
- plan = str(response.content)
- last_state_message = self.message_manager.get_messages()[-1]
- if isinstance(last_state_message, HumanMessage):
- # remove image from last state message
- if isinstance(last_state_message.content, list):
- for msg in last_state_message.content:
- if msg['type'] == 'text':
- msg['text'] += f"\nPlanning Agent outputs plans:\n {plan}\n"
- else:
- last_state_message.content += f"\nPlanning Agent outputs plans:\n {plan}\n "
-
- try:
- plan_json = json.loads(plan.replace("```json", "").replace("```", ""))
- logger.info(f'š Plans:\n{json.dumps(plan_json, indent=4)}')
-
- if hasattr(response, "reasoning_content"):
- logger.info("𤯠Start Planning Deep Thinking: ")
- logger.info(response.reasoning_content)
- logger.info("𤯠End Planning Deep Thinking")
-
- except json.JSONDecodeError:
- logger.info(f'š Plans:\n{plan}')
- except Exception as e:
- logger.debug(f'Error parsing planning analysis: {e}')
- logger.info(f'š Plans: {plan}')
- return plan
-
- @time_execution_async("--step")
- async def step(self, step_info: Optional[CustomAgentStepInfo] = None) -> None:
- """Execute one step of the task"""
- logger.info(f"\nš Step {self.state.n_steps}")
- state = None
- model_output = None
- result: list[ActionResult] = []
- step_start_time = time.time()
- tokens = 0
-
- try:
- state = await self.browser_context.get_state()
- await self._raise_if_stopped_or_paused()
-
- self.message_manager.add_state_message(state, self.state.last_action, self.state.last_result, step_info,
- self.settings.use_vision)
-
- # Run planner at specified intervals if planner is configured
- if self.settings.planner_llm and self.state.n_steps % self.settings.planner_interval == 0:
- await self._run_planner()
- input_messages = self.message_manager.get_messages()
- tokens = self._message_manager.state.history.current_tokens
-
- try:
- model_output = await self.get_next_action(input_messages)
- self.update_step_info(model_output, step_info)
- self.state.n_steps += 1
-
- if self.register_new_step_callback:
- await self.register_new_step_callback(state, model_output, self.state.n_steps)
-
- if self.settings.save_conversation_path:
- target = self.settings.save_conversation_path + f'_{self.state.n_steps}.txt'
- save_conversation(input_messages, model_output, target,
- self.settings.save_conversation_path_encoding)
-
- if self.model_name != "deepseek-reasoner":
- # remove prev message
- self.message_manager._remove_state_message_by_index(-1)
- await self._raise_if_stopped_or_paused()
- except Exception as e:
- # model call failed, remove last state message from history
- self.message_manager._remove_state_message_by_index(-1)
- raise e
-
- result: list[ActionResult] = await self.multi_act(model_output.action)
- for ret_ in result:
- if ret_.extracted_content and "Extracted page" in ret_.extracted_content:
- # record every extracted page
- if ret_.extracted_content[:100] not in self.state.extracted_content:
- self.state.extracted_content += ret_.extracted_content
- self.state.last_result = result
- self.state.last_action = model_output.action
- if len(result) > 0 and result[-1].is_done:
- if not self.state.extracted_content:
- self.state.extracted_content = step_info.memory
- result[-1].extracted_content = self.state.extracted_content
- logger.info(f"š Result: {result[-1].extracted_content}")
-
- self.state.consecutive_failures = 0
-
- except InterruptedError:
- logger.debug('Agent paused')
- self.state.last_result = [
- ActionResult(
- error='The agent was paused - now continuing actions might need to be repeated',
- include_in_memory=True
- )
- ]
- return
-
- except Exception as e:
- result = await self._handle_step_error(e)
- self.state.last_result = result
-
- finally:
- step_end_time = time.time()
- actions = [a.model_dump(exclude_unset=True) for a in model_output.action] if model_output else []
- self.telemetry.capture(
- AgentStepTelemetryEvent(
- agent_id=self.state.agent_id,
- step=self.state.n_steps,
- actions=actions,
- consecutive_failures=self.state.consecutive_failures,
- step_error=[r.error for r in result if r.error] if result else ['No result'],
- )
- )
- if not result:
- return
-
- if state:
- metadata = StepMetadata(
- step_number=self.state.n_steps,
- step_start_time=step_start_time,
- step_end_time=step_end_time,
- input_tokens=tokens,
- )
- self._make_history_item(model_output, state, result, metadata)
-
- async def run(self, max_steps: int = 100) -> AgentHistoryList:
- """Execute the task with maximum number of steps"""
- try:
- self._log_agent_run()
-
- # Execute initial actions if provided
- if self.initial_actions:
- result = await self.multi_act(self.initial_actions, check_for_new_elements=False)
- self.state.last_result = result
-
- step_info = CustomAgentStepInfo(
- task=self.task,
- add_infos=self.add_infos,
- step_number=1,
- max_steps=max_steps,
- memory="",
- )
-
- for step in range(max_steps):
- # Check if we should stop due to too many failures
- if self.state.consecutive_failures >= self.settings.max_failures:
- logger.error(f'ā Stopping due to {self.settings.max_failures} consecutive failures')
- break
-
- # Check control flags before each step
- if self.state.stopped:
- logger.info('Agent stopped')
- break
-
- while self.state.paused:
- await asyncio.sleep(0.2) # Small delay to prevent CPU spinning
- if self.state.stopped: # Allow stopping while paused
- break
-
- await self.step(step_info)
-
- if self.state.history.is_done():
- if self.settings.validate_output and step < max_steps - 1:
- if not await self._validate_output():
- continue
-
- await self.log_completion()
- break
- else:
- logger.info("ā Failed to complete task in maximum steps")
- if not self.state.extracted_content:
- self.state.history.history[-1].result[-1].extracted_content = step_info.memory
- else:
- self.state.history.history[-1].result[-1].extracted_content = self.state.extracted_content
-
- return self.state.history
-
- finally:
- self.telemetry.capture(
- AgentEndTelemetryEvent(
- agent_id=self.state.agent_id,
- is_done=self.state.history.is_done(),
- success=self.state.history.is_successful(),
- steps=self.state.n_steps,
- max_steps_reached=self.state.n_steps >= max_steps,
- errors=self.state.history.errors(),
- total_input_tokens=self.state.history.total_input_tokens(),
- total_duration_seconds=self.state.history.total_duration_seconds(),
- )
- )
-
- if not self.injected_browser_context:
- await self.browser_context.close()
-
- if not self.injected_browser and self.browser:
- await self.browser.close()
-
- if self.settings.generate_gif:
- output_path: str = 'agent_history.gif'
- if isinstance(self.settings.generate_gif, str):
- output_path = self.settings.generate_gif
-
- create_history_gif(task=self.task, history=self.state.history, output_path=output_path)
diff --git a/src/agent/custom_message_manager.py b/src/agent/custom_message_manager.py
deleted file mode 100644
index 212c3fb..0000000
--- a/src/agent/custom_message_manager.py
+++ /dev/null
@@ -1,109 +0,0 @@
-from __future__ import annotations
-
-import logging
-import pdb
-from typing import List, Optional, Type, Dict
-
-from browser_use.agent.message_manager.service import MessageManager
-from browser_use.agent.message_manager.views import MessageHistory
-from browser_use.agent.prompts import SystemPrompt, AgentMessagePrompt
-from browser_use.agent.views import ActionResult, AgentStepInfo, ActionModel
-from browser_use.browser.views import BrowserState
-from browser_use.agent.message_manager.service import MessageManagerSettings
-from browser_use.agent.views import ActionResult, AgentOutput, AgentStepInfo, MessageManagerState
-from langchain_core.language_models import BaseChatModel
-from langchain_anthropic import ChatAnthropic
-from langchain_core.language_models import BaseChatModel
-from langchain_core.messages import (
- AIMessage,
- BaseMessage,
- HumanMessage,
- ToolMessage,
- SystemMessage
-)
-from langchain_openai import ChatOpenAI
-from ..utils.llm import DeepSeekR1ChatOpenAI
-from .custom_prompts import CustomAgentMessagePrompt
-
-logger = logging.getLogger(__name__)
-
-
-class CustomMessageManagerSettings(MessageManagerSettings):
- agent_prompt_class: Type[AgentMessagePrompt] = AgentMessagePrompt
-
-
-class CustomMessageManager(MessageManager):
- def __init__(
- self,
- task: str,
- system_message: SystemMessage,
- settings: MessageManagerSettings = MessageManagerSettings(),
- state: MessageManagerState = MessageManagerState(),
- ):
- super().__init__(
- task=task,
- system_message=system_message,
- settings=settings,
- state=state
- )
-
- def _init_messages(self) -> None:
- """Initialize the message history with system message, context, task, and other initial messages"""
- self._add_message_with_tokens(self.system_prompt)
- self.context_content = ""
-
- if self.settings.message_context:
- self.context_content += 'Context for the task' + self.settings.message_context
-
- if self.settings.sensitive_data:
- info = f'Here are placeholders for sensitive data: {list(self.settings.sensitive_data.keys())}'
- info += 'To use them, write
for proper display in HTML
+ content = f"{json_string}
"
+
+ except AttributeError as ae:
+ logger.error(
+ f"AttributeError during model dump: {ae}. Check if 'action' or 'current_state' or their items support 'model_dump'."
+ )
+ content = f"Error: Could not format agent output (AttributeError: {ae}).\nRaw output: {str(model_output)}
"
+ except Exception as e:
+ logger.error(f"Error formatting agent output: {e}", exc_info=True)
+ # Fallback to simple string representation on error
+ content = f"Error formatting agent output.\nRaw output:\n{str(model_output)}
"
+
+ return content.strip()
+
+
+# --- Updated Callback Implementation ---
+
+
+async def _handle_new_step(
+ webui_manager: WebuiManager, state: BrowserState, output: AgentOutput, step_num: int
+):
+ """Callback for each step taken by the agent, including screenshot display."""
+
+ # Use the correct chat history attribute name from the user's code
+ if not hasattr(webui_manager, "bu_chat_history"):
+ logger.error(
+ "Attribute 'bu_chat_history' not found in webui_manager! Cannot add chat message."
+ )
+ # Initialize it maybe? Or raise an error? For now, log and potentially skip chat update.
+ webui_manager.bu_chat_history = [] # Initialize if missing (consider if this is the right place)
+ # return # Or stop if this is critical
+ step_num -= 1
+ logger.info(f"Step {step_num} completed.")
+
+ # --- Screenshot Handling ---
+ screenshot_html = ""
+ # Ensure state.screenshot exists and is not empty before proceeding
+ # Use getattr for safer access
+ screenshot_data = getattr(state, "screenshot", None)
+ if screenshot_data:
+ try:
+ # Basic validation: check if it looks like base64
+ if (
+ isinstance(screenshot_data, str) and len(screenshot_data) > 100
+ ): # Arbitrary length check
+ # *** UPDATED STYLE: Removed centering, adjusted width ***
+ img_tag = f'
'
+ screenshot_html = (
+ img_tag + "
"
+ ) # Use
for line break after inline-block image
+ else:
+ logger.warning(
+ f"Screenshot for step {step_num} seems invalid (type: {type(screenshot_data)}, len: {len(screenshot_data) if isinstance(screenshot_data, str) else 'N/A'})."
+ )
+ screenshot_html = "**[Invalid screenshot data]**
"
+
+ except Exception as e:
+ logger.error(
+ f"Error processing or formatting screenshot for step {step_num}: {e}",
+ exc_info=True,
+ )
+ screenshot_html = "**[Error displaying screenshot]**
"
+ else:
+ logger.debug(f"No screenshot available for step {step_num}.")
+
+ # --- Format Agent Output ---
+ formatted_output = _format_agent_output(output) # Use the updated function
+
+ # --- Combine and Append to Chat ---
+ step_header = f"--- **Step {step_num}** ---"
+ # Combine header, image (with line break), and JSON block
+ final_content = step_header + "
" + screenshot_html + formatted_output
+
+ chat_message = {
+ "role": "assistant",
+ "content": final_content.strip(), # Remove leading/trailing whitespace
+ }
+
+ # Append to the correct chat history list
+ webui_manager.bu_chat_history.append(chat_message)
+
+ await asyncio.sleep(0.05)
+
+
+def _handle_done(webui_manager: WebuiManager, history: AgentHistoryList):
+ """Callback when the agent finishes the task (success or failure)."""
+ logger.info(
+ f"Agent task finished. Duration: {history.total_duration_seconds():.2f}s, Tokens: {history.total_input_tokens()}"
+ )
+ final_summary = "**Task Completed**\n"
+ final_summary += f"- Duration: {history.total_duration_seconds():.2f} seconds\n"
+ final_summary += f"- Total Input Tokens: {history.total_input_tokens()}\n" # Or total tokens if available
+
+ final_result = history.final_result()
+ if final_result:
+ final_summary += f"- Final Result: {final_result}\n"
+
+ errors = history.errors()
+ if errors and any(errors):
+ final_summary += f"- **Errors:**\n```\n{errors}\n```\n"
+ else:
+ final_summary += "- Status: Success\n"
+
+ webui_manager.bu_chat_history.append(
+ {"role": "assistant", "content": final_summary}
+ )
+
+
+async def _ask_assistant_callback(
+ webui_manager: WebuiManager, query: str, browser_context: BrowserContext
+) -> Dict[str, Any]:
+ """Callback triggered by the agent's ask_for_assistant action."""
+ logger.info("Agent requires assistance. Waiting for user input.")
+
+ if not hasattr(webui_manager, "_chat_history"):
+ logger.error("Chat history not found in webui_manager during ask_assistant!")
+ return {"response": "Internal Error: Cannot display help request."}
+
+ webui_manager.bu_chat_history.append(
+ {
+ "role": "assistant",
+ "content": f"**Need Help:** {query}\nPlease provide information or perform the required action in the browser, then type your response/confirmation below and click 'Submit Response'.",
+ }
+ )
+
+ # Use state stored in webui_manager
+ webui_manager.bu_response_event = asyncio.Event()
+ webui_manager.bu_user_help_response = None # Reset previous response
+
+ try:
+ logger.info("Waiting for user response event...")
+ await asyncio.wait_for(
+ webui_manager.bu_response_event.wait(), timeout=3600.0
+ ) # Long timeout
+ logger.info("User response event received.")
+ except asyncio.TimeoutError:
+ logger.warning("Timeout waiting for user assistance.")
+ webui_manager.bu_chat_history.append(
+ {
+ "role": "assistant",
+ "content": "**Timeout:** No response received. Trying to proceed.",
+ }
+ )
+ webui_manager.bu_response_event = None # Clear the event
+ return {"response": "Timeout: User did not respond."} # Inform the agent
+
+ response = webui_manager.bu_user_help_response
+ webui_manager.bu_chat_history.append(
+ {"role": "user", "content": response}
+ ) # Show user response in chat
+ webui_manager.bu_response_event = (
+ None # Clear the event for the next potential request
+ )
+ return {"response": response}
+
+
+# --- Core Agent Execution Logic --- (Needs access to webui_manager)
+
+
+async def run_agent_task(
+ webui_manager: WebuiManager, components: Dict[gr.components.Component, Any]
+) -> AsyncGenerator[Dict[gr.components.Component, Any], None]:
+ """Handles the entire lifecycle of initializing and running the agent."""
+
+ # --- Get Components ---
+ # Need handles to specific UI components to update them
+ user_input_comp = webui_manager.get_component_by_id("browser_use_agent.user_input")
+ run_button_comp = webui_manager.get_component_by_id("browser_use_agent.run_button")
+ stop_button_comp = webui_manager.get_component_by_id(
+ "browser_use_agent.stop_button"
+ )
+ pause_resume_button_comp = webui_manager.get_component_by_id(
+ "browser_use_agent.pause_resume_button"
+ )
+ clear_button_comp = webui_manager.get_component_by_id(
+ "browser_use_agent.clear_button"
+ )
+ chatbot_comp = webui_manager.get_component_by_id("browser_use_agent.chatbot")
+ history_file_comp = webui_manager.get_component_by_id(
+ "browser_use_agent.agent_history_file"
+ )
+ gif_comp = webui_manager.get_component_by_id("browser_use_agent.recording_gif")
+ browser_view_comp = webui_manager.get_component_by_id(
+ "browser_use_agent.browser_view"
+ )
+
+ # --- 1. Get Task and Initial UI Update ---
+ task = components.get(user_input_comp, "").strip()
+ if not task:
+ gr.Warning("Please enter a task.")
+ yield {run_button_comp: gr.update(interactive=True)}
+ return
+
+ # Set running state indirectly via _current_task
+ webui_manager.bu_chat_history.append({"role": "user", "content": task})
+
+ yield {
+ user_input_comp: gr.Textbox(
+ value="", interactive=False, placeholder="Agent is running..."
+ ),
+ run_button_comp: gr.Button(value="ā³ Running...", interactive=False),
+ stop_button_comp: gr.Button(interactive=True),
+ pause_resume_button_comp: gr.Button(value="āøļø Pause", interactive=True),
+ clear_button_comp: gr.Button(interactive=False),
+ chatbot_comp: gr.update(value=webui_manager.bu_chat_history),
+ history_file_comp: gr.update(value=None),
+ gif_comp: gr.update(value=None),
+ }
+
+ # --- Agent Settings ---
+ # Access settings values via components dict, getting IDs from webui_manager
+ def get_setting(key, default=None):
+ comp = webui_manager.id_to_component.get(f"agent_settings.{key}")
+ return components.get(comp, default) if comp else default
+
+ override_system_prompt = get_setting("override_system_prompt") or None
+ extend_system_prompt = get_setting("extend_system_prompt") or None
+ llm_provider_name = get_setting(
+ "llm_provider", None
+ ) # Default to None if not found
+ llm_model_name = get_setting("llm_model_name", None)
+ llm_temperature = get_setting("llm_temperature", 0.6)
+ use_vision = get_setting("use_vision", True)
+ ollama_num_ctx = get_setting("ollama_num_ctx", 16000)
+ llm_base_url = get_setting("llm_base_url") or None
+ llm_api_key = get_setting("llm_api_key") or None
+ max_steps = get_setting("max_steps", 100)
+ max_actions = get_setting("max_actions", 10)
+ max_input_tokens = get_setting("max_input_tokens", 128000)
+ tool_calling_str = get_setting("tool_calling_method", "auto")
+ tool_calling_method = tool_calling_str if tool_calling_str != "None" else None
+ mcp_server_config_comp = webui_manager.id_to_component.get(
+ "agent_settings.mcp_server_config"
+ )
+ mcp_server_config_str = (
+ components.get(mcp_server_config_comp) if mcp_server_config_comp else None
+ )
+ mcp_server_config = (
+ json.loads(mcp_server_config_str) if mcp_server_config_str else None
+ )
+
+ # Planner LLM Settings (Optional)
+ planner_llm_provider_name = get_setting("planner_llm_provider") or None
+ planner_llm = None
+ planner_use_vision = False
+ if planner_llm_provider_name:
+ planner_llm_model_name = get_setting("planner_llm_model_name")
+ planner_llm_temperature = get_setting("planner_llm_temperature", 0.6)
+ planner_ollama_num_ctx = get_setting("planner_ollama_num_ctx", 16000)
+ planner_llm_base_url = get_setting("planner_llm_base_url") or None
+ planner_llm_api_key = get_setting("planner_llm_api_key") or None
+ planner_use_vision = get_setting("planner_use_vision", False)
+
+ planner_llm = await _initialize_llm(
+ planner_llm_provider_name,
+ planner_llm_model_name,
+ planner_llm_temperature,
+ planner_llm_base_url,
+ planner_llm_api_key,
+ planner_ollama_num_ctx if planner_llm_provider_name == "ollama" else None,
+ )
+
+ # --- Browser Settings ---
+ def get_browser_setting(key, default=None):
+ comp = webui_manager.id_to_component.get(f"browser_settings.{key}")
+ return components.get(comp, default) if comp else default
+
+ browser_binary_path = get_browser_setting("browser_binary_path") or None
+ browser_user_data_dir = get_browser_setting("browser_user_data_dir") or None
+ use_own_browser = get_browser_setting(
+ "use_own_browser", False
+ ) # Logic handled by CDP/WSS presence
+ keep_browser_open = get_browser_setting("keep_browser_open", False)
+ headless = get_browser_setting("headless", False)
+ disable_security = get_browser_setting("disable_security", False)
+ window_w = int(get_browser_setting("window_w", 1280))
+ window_h = int(get_browser_setting("window_h", 1100))
+ cdp_url = get_browser_setting("cdp_url") or None
+ wss_url = get_browser_setting("wss_url") or None
+ save_recording_path = get_browser_setting("save_recording_path") or None
+ save_trace_path = get_browser_setting("save_trace_path") or None
+ save_agent_history_path = get_browser_setting(
+ "save_agent_history_path", "./tmp/agent_history"
+ )
+ save_download_path = get_browser_setting("save_download_path", "./tmp/downloads")
+
+ stream_vw = 70
+ stream_vh = int(70 * window_h // window_w)
+
+ os.makedirs(save_agent_history_path, exist_ok=True)
+ if save_recording_path:
+ os.makedirs(save_recording_path, exist_ok=True)
+ if save_trace_path:
+ os.makedirs(save_trace_path, exist_ok=True)
+ if save_download_path:
+ os.makedirs(save_download_path, exist_ok=True)
+
+ # --- 2. Initialize LLM ---
+ main_llm = await _initialize_llm(
+ llm_provider_name,
+ llm_model_name,
+ llm_temperature,
+ llm_base_url,
+ llm_api_key,
+ ollama_num_ctx if llm_provider_name == "ollama" else None,
+ )
+
+ # Pass the webui_manager instance to the callback when wrapping it
+ async def ask_callback_wrapper(
+ query: str, browser_context: BrowserContext
+ ) -> Dict[str, Any]:
+ return await _ask_assistant_callback(webui_manager, query, browser_context)
+
+ if not webui_manager.bu_controller:
+ webui_manager.bu_controller = CustomController(
+ ask_assistant_callback=ask_callback_wrapper
+ )
+ await webui_manager.bu_controller.setup_mcp_client(mcp_server_config)
+
+ # --- 4. Initialize Browser and Context ---
+ should_close_browser_on_finish = not keep_browser_open
+
+ try:
+ # Close existing resources if not keeping open
+ if not keep_browser_open:
+ if webui_manager.bu_browser_context:
+ logger.info("Closing previous browser context.")
+ await webui_manager.bu_browser_context.close()
+ webui_manager.bu_browser_context = None
+ if webui_manager.bu_browser:
+ logger.info("Closing previous browser.")
+ await webui_manager.bu_browser.close()
+ webui_manager.bu_browser = None
+
+ # Create Browser if needed
+ if not webui_manager.bu_browser:
+ logger.info("Launching new browser instance.")
+ extra_args = [f"--window-size={window_w},{window_h}"]
+ if use_own_browser:
+ browser_binary_path = os.getenv("BROWSER_PATH", None) or browser_binary_path
+ if browser_binary_path == "":
+ browser_binary_path = None
+ browser_user_data = browser_user_data_dir or os.getenv("BROWSER_USER_DATA", None)
+ if browser_user_data:
+ extra_args += [f"--user-data-dir={browser_user_data}"]
+ else:
+ browser_binary_path = None
+
+ webui_manager.bu_browser = CustomBrowser(
+ config=BrowserConfig(
+ headless=headless,
+ disable_security=disable_security,
+ browser_binary_path=browser_binary_path,
+ extra_browser_args=extra_args,
+ wss_url=wss_url,
+ cdp_url=cdp_url,
+ )
+ )
+
+ # Create Context if needed
+ if not webui_manager.bu_browser_context:
+ logger.info("Creating new browser context.")
+ context_config = BrowserContextConfig(
+ trace_path=save_trace_path if save_trace_path else None,
+ save_recording_path=save_recording_path
+ if save_recording_path
+ else None,
+ save_downloads_path=save_download_path if save_download_path else None,
+ window_height=window_h,
+ window_width=window_w,
+ )
+ if not webui_manager.bu_browser:
+ raise ValueError("Browser not initialized, cannot create context.")
+ webui_manager.bu_browser_context = (
+ await webui_manager.bu_browser.new_context(config=context_config)
+ )
+
+ # --- 5. Initialize or Update Agent ---
+ webui_manager.bu_agent_task_id = str(uuid.uuid4()) # New ID for this task run
+ os.makedirs(
+ os.path.join(save_agent_history_path, webui_manager.bu_agent_task_id),
+ exist_ok=True,
+ )
+ history_file = os.path.join(
+ save_agent_history_path,
+ webui_manager.bu_agent_task_id,
+ f"{webui_manager.bu_agent_task_id}.json",
+ )
+ gif_path = os.path.join(
+ save_agent_history_path,
+ webui_manager.bu_agent_task_id,
+ f"{webui_manager.bu_agent_task_id}.gif",
+ )
+
+ # Pass the webui_manager to callbacks when wrapping them
+ async def step_callback_wrapper(
+ state: BrowserState, output: AgentOutput, step_num: int
+ ):
+ await _handle_new_step(webui_manager, state, output, step_num)
+
+ def done_callback_wrapper(history: AgentHistoryList):
+ _handle_done(webui_manager, history)
+
+ if not webui_manager.bu_agent:
+ logger.info(f"Initializing new agent for task: {task}")
+ if not webui_manager.bu_browser or not webui_manager.bu_browser_context:
+ raise ValueError(
+ "Browser or Context not initialized, cannot create agent."
+ )
+ webui_manager.bu_agent = BrowserUseAgent(
+ task=task,
+ llm=main_llm,
+ browser=webui_manager.bu_browser,
+ browser_context=webui_manager.bu_browser_context,
+ controller=webui_manager.bu_controller,
+ register_new_step_callback=step_callback_wrapper,
+ register_done_callback=done_callback_wrapper,
+ use_vision=use_vision,
+ override_system_message=override_system_prompt,
+ extend_system_message=extend_system_prompt,
+ max_input_tokens=max_input_tokens,
+ max_actions_per_step=max_actions,
+ tool_calling_method=tool_calling_method,
+ planner_llm=planner_llm,
+ use_vision_for_planner=planner_use_vision if planner_llm else False,
+ source="webui",
+ )
+ webui_manager.bu_agent.state.agent_id = webui_manager.bu_agent_task_id
+ webui_manager.bu_agent.settings.generate_gif = gif_path
+ else:
+ webui_manager.bu_agent.state.agent_id = webui_manager.bu_agent_task_id
+ webui_manager.bu_agent.add_new_task(task)
+ webui_manager.bu_agent.settings.generate_gif = gif_path
+ webui_manager.bu_agent.browser = webui_manager.bu_browser
+ webui_manager.bu_agent.browser_context = webui_manager.bu_browser_context
+ webui_manager.bu_agent.controller = webui_manager.bu_controller
+
+ # --- 6. Run Agent Task and Stream Updates ---
+ agent_run_coro = webui_manager.bu_agent.run(max_steps=max_steps)
+ agent_task = asyncio.create_task(agent_run_coro)
+ webui_manager.bu_current_task = agent_task # Store the task
+
+ last_chat_len = len(webui_manager.bu_chat_history)
+ while not agent_task.done():
+ is_paused = webui_manager.bu_agent.state.paused
+ is_stopped = webui_manager.bu_agent.state.stopped
+
+ # Check for pause state
+ if is_paused:
+ yield {
+ pause_resume_button_comp: gr.update(
+ value="ā¶ļø Resume", interactive=True
+ ),
+ stop_button_comp: gr.update(interactive=True),
+ }
+ # Wait until pause is released or task is stopped/done
+ while is_paused and not agent_task.done():
+ # Re-check agent state in loop
+ is_paused = webui_manager.bu_agent.state.paused
+ is_stopped = webui_manager.bu_agent.state.stopped
+ if is_stopped: # Stop signal received while paused
+ break
+ await asyncio.sleep(0.2)
+
+ if (
+ agent_task.done() or is_stopped
+ ): # If stopped or task finished while paused
+ break
+
+ # If resumed, yield UI update
+ yield {
+ pause_resume_button_comp: gr.update(
+ value="āøļø Pause", interactive=True
+ ),
+ run_button_comp: gr.update(
+ value="ā³ Running...", interactive=False
+ ),
+ }
+
+ # Check if agent stopped itself or stop button was pressed (which sets agent.state.stopped)
+ if is_stopped:
+ logger.info("Agent has stopped (internally or via stop button).")
+ if not agent_task.done():
+ # Ensure the task coroutine finishes if agent just set flag
+ try:
+ await asyncio.wait_for(
+ agent_task, timeout=1.0
+ ) # Give it a moment to exit run()
+ except asyncio.TimeoutError:
+ logger.warning(
+ "Agent task did not finish quickly after stop signal, cancelling."
+ )
+ agent_task.cancel()
+ except Exception: # Catch task exceptions if it errors on stop
+ pass
+ break # Exit the streaming loop
+
+ # Check if agent is asking for help (via response_event)
+ update_dict = {}
+ if webui_manager.bu_response_event is not None:
+ update_dict = {
+ user_input_comp: gr.update(
+ placeholder="Agent needs help. Enter response and submit.",
+ interactive=True,
+ ),
+ run_button_comp: gr.update(
+ value="āļø Submit Response", interactive=True
+ ),
+ pause_resume_button_comp: gr.update(interactive=False),
+ stop_button_comp: gr.update(interactive=False),
+ chatbot_comp: gr.update(value=webui_manager.bu_chat_history),
+ }
+ last_chat_len = len(webui_manager.bu_chat_history)
+ yield update_dict
+ # Wait until response is submitted or task finishes
+ while (
+ webui_manager.bu_response_event is not None
+ and not agent_task.done()
+ ):
+ await asyncio.sleep(0.2)
+ # Restore UI after response submitted or if task ended unexpectedly
+ if not agent_task.done():
+ yield {
+ user_input_comp: gr.update(
+ placeholder="Agent is running...", interactive=False
+ ),
+ run_button_comp: gr.update(
+ value="ā³ Running...", interactive=False
+ ),
+ pause_resume_button_comp: gr.update(interactive=True),
+ stop_button_comp: gr.update(interactive=True),
+ }
+ else:
+ break # Task finished while waiting for response
+
+ # Update Chatbot if new messages arrived via callbacks
+ if len(webui_manager.bu_chat_history) > last_chat_len:
+ update_dict[chatbot_comp] = gr.update(
+ value=webui_manager.bu_chat_history
+ )
+ last_chat_len = len(webui_manager.bu_chat_history)
+
+ # Update Browser View
+ if headless and webui_manager.bu_browser_context:
+ try:
+ screenshot_b64 = (
+ await webui_manager.bu_browser_context.take_screenshot()
+ )
+ if screenshot_b64:
+                        html_content = f'<img src="data:image/jpeg;base64,{screenshot_b64}" style="width:100%; height:auto;" alt="Browser Screenshot">'
+ update_dict[browser_view_comp] = gr.update(
+ value=html_content, visible=True
+ )
+ else:
+                        html_content = f"<h1>Waiting for browser session...</h1>"
+ update_dict[browser_view_comp] = gr.update(
+ value=html_content, visible=True
+ )
+ except Exception as e:
+ logger.debug(f"Failed to capture screenshot: {e}")
+ update_dict[browser_view_comp] = gr.update(
+ value="Error loading view...",
+ visible=True,
+ )
+ else:
+ update_dict[browser_view_comp] = gr.update(visible=False)
+
+ # Yield accumulated updates
+ if update_dict:
+ yield update_dict
+
+ await asyncio.sleep(0.1) # Polling interval
+
+ # --- 7. Task Finalization ---
+ webui_manager.bu_agent.state.paused = False
+ webui_manager.bu_agent.state.stopped = False
+ final_update = {}
+ try:
+ logger.info("Agent task completing...")
+        # Await the task to ensure completion and catch exceptions if not already caught
+ if not agent_task.done():
+ await agent_task # Retrieve result/exception
+ elif agent_task.exception(): # Check if task finished with exception
+ agent_task.result() # Raise the exception to be caught below
+ logger.info("Agent task completed processing.")
+
+ logger.info(f"Explicitly saving agent history to: {history_file}")
+ webui_manager.bu_agent.save_history(history_file)
+
+ if os.path.exists(history_file):
+ final_update[history_file_comp] = gr.File(value=history_file)
+
+ if gif_path and os.path.exists(gif_path):
+ logger.info(f"GIF found at: {gif_path}")
+ final_update[gif_comp] = gr.Image(value=gif_path)
+
+ except asyncio.CancelledError:
+ logger.info("Agent task was cancelled.")
+ if not any(
+ "Cancelled" in msg.get("content", "")
+ for msg in webui_manager.bu_chat_history
+ if msg.get("role") == "assistant"
+ ):
+ webui_manager.bu_chat_history.append(
+ {"role": "assistant", "content": "**Task Cancelled**."}
+ )
+ final_update[chatbot_comp] = gr.update(value=webui_manager.bu_chat_history)
+ except Exception as e:
+ logger.error(f"Error during agent execution: {e}", exc_info=True)
+ error_message = (
+ f"**Agent Execution Error:**\n```\n{type(e).__name__}: {e}\n```"
+ )
+ if not any(
+ error_message in msg.get("content", "")
+ for msg in webui_manager.bu_chat_history
+ if msg.get("role") == "assistant"
+ ):
+ webui_manager.bu_chat_history.append(
+ {"role": "assistant", "content": error_message}
+ )
+ final_update[chatbot_comp] = gr.update(value=webui_manager.bu_chat_history)
+ gr.Error(f"Agent execution failed: {e}")
+
+ finally:
+ webui_manager.bu_current_task = None # Clear the task reference
+
+ # Close browser/context if requested
+ if should_close_browser_on_finish:
+ if webui_manager.bu_browser_context:
+ logger.info("Closing browser context after task.")
+ await webui_manager.bu_browser_context.close()
+ webui_manager.bu_browser_context = None
+ if webui_manager.bu_browser:
+ logger.info("Closing browser after task.")
+ await webui_manager.bu_browser.close()
+ webui_manager.bu_browser = None
+
+ # --- 8. Final UI Update ---
+ final_update.update(
+ {
+ user_input_comp: gr.update(
+ value="",
+ interactive=True,
+ placeholder="Enter your next task...",
+ ),
+ run_button_comp: gr.update(value="ā¶ļø Submit Task", interactive=True),
+ stop_button_comp: gr.update(value="ā¹ļø Stop", interactive=False),
+ pause_resume_button_comp: gr.update(
+ value="āøļø Pause", interactive=False
+ ),
+ clear_button_comp: gr.update(interactive=True),
+ # Ensure final chat history is shown
+ chatbot_comp: gr.update(value=webui_manager.bu_chat_history),
+ }
+ )
+ yield final_update
+
+ except Exception as e:
+ # Catch errors during setup (before agent run starts)
+ logger.error(f"Error setting up agent task: {e}", exc_info=True)
+ webui_manager.bu_current_task = None # Ensure state is reset
+ yield {
+ user_input_comp: gr.update(
+ interactive=True, placeholder="Error during setup. Enter task..."
+ ),
+ run_button_comp: gr.update(value="ā¶ļø Submit Task", interactive=True),
+ stop_button_comp: gr.update(value="ā¹ļø Stop", interactive=False),
+ pause_resume_button_comp: gr.update(value="āøļø Pause", interactive=False),
+ clear_button_comp: gr.update(interactive=True),
+ chatbot_comp: gr.update(
+ value=webui_manager.bu_chat_history
+ + [{"role": "assistant", "content": f"**Setup Error:** {e}"}]
+ ),
+ }
+
+
+# --- Button Click Handlers --- (Need access to webui_manager)
+
+
+async def handle_submit(
+ webui_manager: WebuiManager, components: Dict[gr.components.Component, Any]
+):
+ """Handles clicks on the main 'Submit' button."""
+ user_input_comp = webui_manager.get_component_by_id("browser_use_agent.user_input")
+ user_input_value = components.get(user_input_comp, "").strip()
+
+ # Check if waiting for user assistance
+ if webui_manager.bu_response_event and not webui_manager.bu_response_event.is_set():
+ logger.info(f"User submitted assistance: {user_input_value}")
+ webui_manager.bu_user_help_response = (
+ user_input_value if user_input_value else "User provided no text response."
+ )
+ webui_manager.bu_response_event.set()
+ # UI updates handled by the main loop reacting to the event being set
+ yield {
+ user_input_comp: gr.update(
+ value="",
+ interactive=False,
+ placeholder="Waiting for agent to continue...",
+ ),
+ webui_manager.get_component_by_id(
+ "browser_use_agent.run_button"
+ ): gr.update(value="ā³ Running...", interactive=False),
+ }
+ # Check if a task is currently running (using _current_task)
+ elif webui_manager.bu_current_task and not webui_manager.bu_current_task.done():
+ logger.warning(
+ "Submit button clicked while agent is already running and not asking for help."
+ )
+ gr.Info("Agent is currently running. Please wait or use Stop/Pause.")
+ yield {} # No change
+ else:
+ # Handle submission for a new task
+ logger.info("Submit button clicked for new task.")
+ # Use async generator to stream updates from run_agent_task
+ async for update in run_agent_task(webui_manager, components):
+ yield update
+
+
+async def handle_stop(webui_manager: WebuiManager):
+ """Handles clicks on the 'Stop' button."""
+ logger.info("Stop button clicked.")
+ agent = webui_manager.bu_agent
+ task = webui_manager.bu_current_task
+
+ if agent and task and not task.done():
+ # Signal the agent to stop by setting its internal flag
+ agent.state.stopped = True
+ agent.state.paused = False # Ensure not paused if stopped
+ return {
+ webui_manager.get_component_by_id(
+ "browser_use_agent.stop_button"
+ ): gr.update(interactive=False, value="ā¹ļø Stopping..."),
+ webui_manager.get_component_by_id(
+ "browser_use_agent.pause_resume_button"
+ ): gr.update(interactive=False),
+ webui_manager.get_component_by_id(
+ "browser_use_agent.run_button"
+ ): gr.update(interactive=False),
+ }
+ else:
+ logger.warning("Stop clicked but agent is not running or task is already done.")
+ # Reset UI just in case it's stuck
+ return {
+ webui_manager.get_component_by_id(
+ "browser_use_agent.run_button"
+ ): gr.update(interactive=True),
+ webui_manager.get_component_by_id(
+ "browser_use_agent.stop_button"
+ ): gr.update(interactive=False),
+ webui_manager.get_component_by_id(
+ "browser_use_agent.pause_resume_button"
+ ): gr.update(interactive=False),
+ webui_manager.get_component_by_id(
+ "browser_use_agent.clear_button"
+ ): gr.update(interactive=True),
+ }
+
+
+async def handle_pause_resume(webui_manager: WebuiManager):
+ """Handles clicks on the 'Pause/Resume' button."""
+ agent = webui_manager.bu_agent
+ task = webui_manager.bu_current_task
+
+ if agent and task and not task.done():
+ if agent.state.paused:
+ logger.info("Resume button clicked.")
+ agent.resume()
+ # UI update happens in main loop
+ return {
+ webui_manager.get_component_by_id(
+ "browser_use_agent.pause_resume_button"
+ ): gr.update(value="āøļø Pause", interactive=True)
+ } # Optimistic update
+ else:
+ logger.info("Pause button clicked.")
+ agent.pause()
+ return {
+ webui_manager.get_component_by_id(
+ "browser_use_agent.pause_resume_button"
+ ): gr.update(value="ā¶ļø Resume", interactive=True)
+ } # Optimistic update
+ else:
+ logger.warning(
+ "Pause/Resume clicked but agent is not running or doesn't support state."
+ )
+ return {} # No change
+
+
+async def handle_clear(webui_manager: WebuiManager):
+ """Handles clicks on the 'Clear' button."""
+ logger.info("Clear button clicked.")
+
+ # Stop any running task first
+ task = webui_manager.bu_current_task
+ if task and not task.done():
+ logger.info("Clearing requires stopping the current task.")
+ webui_manager.bu_agent.stop()
+ task.cancel()
+ try:
+ await asyncio.wait_for(task, timeout=2.0) # Wait briefly
+ except (asyncio.CancelledError, asyncio.TimeoutError):
+ pass
+ except Exception as e:
+ logger.warning(f"Error stopping task on clear: {e}")
+ webui_manager.bu_current_task = None
+
+ if webui_manager.bu_controller:
+ await webui_manager.bu_controller.close_mcp_client()
+ webui_manager.bu_controller = None
+ webui_manager.bu_agent = None
+
+ # Reset state stored in manager
+ webui_manager.bu_chat_history = []
+ webui_manager.bu_response_event = None
+ webui_manager.bu_user_help_response = None
+ webui_manager.bu_agent_task_id = None
+
+ logger.info("Agent state and browser resources cleared.")
+
+ # Reset UI components
+ return {
+ webui_manager.get_component_by_id("browser_use_agent.chatbot"): gr.update(
+ value=[]
+ ),
+ webui_manager.get_component_by_id("browser_use_agent.user_input"): gr.update(
+ value="", placeholder="Enter your task here..."
+ ),
+ webui_manager.get_component_by_id(
+ "browser_use_agent.agent_history_file"
+ ): gr.update(value=None),
+ webui_manager.get_component_by_id("browser_use_agent.recording_gif"): gr.update(
+ value=None
+ ),
+ webui_manager.get_component_by_id("browser_use_agent.browser_view"): gr.update(
+ value="Browser Cleared"
+ ),
+ webui_manager.get_component_by_id("browser_use_agent.run_button"): gr.update(
+ value="ā¶ļø Submit Task", interactive=True
+ ),
+ webui_manager.get_component_by_id("browser_use_agent.stop_button"): gr.update(
+ interactive=False
+ ),
+ webui_manager.get_component_by_id(
+ "browser_use_agent.pause_resume_button"
+ ): gr.update(value="āøļø Pause", interactive=False),
+ webui_manager.get_component_by_id("browser_use_agent.clear_button"): gr.update(
+ interactive=True
+ ),
+ }
+
+
+# --- Tab Creation Function ---
+
+
+def create_browser_use_agent_tab(webui_manager: WebuiManager):
+ """
+ Create the run agent tab, defining UI, state, and handlers.
+ """
+ webui_manager.init_browser_use_agent()
+
+ # --- Define UI Components ---
+ tab_components = {}
+ with gr.Column():
+ chatbot = gr.Chatbot(
+ lambda: webui_manager.bu_chat_history, # Load history dynamically
+ elem_id="browser_use_chatbot",
+ label="Agent Interaction",
+ type="messages",
+ height=600,
+ show_copy_button=True,
+ )
+ user_input = gr.Textbox(
+ label="Your Task or Response",
+ placeholder="Enter your task here or provide assistance when asked.",
+ lines=3,
+ interactive=True,
+ elem_id="user_input",
+ )
+ with gr.Row():
+ stop_button = gr.Button(
+ "ā¹ļø Stop", interactive=False, variant="stop", scale=2
+ )
+ pause_resume_button = gr.Button(
+ "āøļø Pause", interactive=False, variant="secondary", scale=2, visible=True
+ )
+ clear_button = gr.Button(
+ "šļø Clear", interactive=True, variant="secondary", scale=2
+ )
+ run_button = gr.Button("ā¶ļø Submit Task", variant="primary", scale=3)
+
+ browser_view = gr.HTML(
+            value="<h1>Browser View (Requires Headless=True)</h1>",
+ label="Browser Live View",
+ elem_id="browser_view",
+ visible=False,
+ )
+ with gr.Column():
+ gr.Markdown("### Task Outputs")
+ agent_history_file = gr.File(label="Agent History JSON", interactive=False)
+ recording_gif = gr.Image(
+ label="Task Recording GIF",
+ format="gif",
+ interactive=False,
+ type="filepath",
+ )
+
+ # --- Store Components in Manager ---
+ tab_components.update(
+ dict(
+ chatbot=chatbot,
+ user_input=user_input,
+ clear_button=clear_button,
+ run_button=run_button,
+ stop_button=stop_button,
+ pause_resume_button=pause_resume_button,
+ agent_history_file=agent_history_file,
+ recording_gif=recording_gif,
+ browser_view=browser_view,
+ )
+ )
+ webui_manager.add_components(
+ "browser_use_agent", tab_components
+ ) # Use "browser_use_agent" as tab_name prefix
+
+ all_managed_components = set(
+ webui_manager.get_components()
+ ) # Get all components known to manager
+ run_tab_outputs = list(tab_components.values())
+
+ async def submit_wrapper(
+ components_dict: Dict[Component, Any],
+ ) -> AsyncGenerator[Dict[Component, Any], None]:
+ """Wrapper for handle_submit that yields its results."""
+ async for update in handle_submit(webui_manager, components_dict):
+ yield update
+
+ async def stop_wrapper() -> AsyncGenerator[Dict[Component, Any], None]:
+ """Wrapper for handle_stop."""
+ update_dict = await handle_stop(webui_manager)
+ yield update_dict
+
+ async def pause_resume_wrapper() -> AsyncGenerator[Dict[Component, Any], None]:
+ """Wrapper for handle_pause_resume."""
+ update_dict = await handle_pause_resume(webui_manager)
+ yield update_dict
+
+ async def clear_wrapper() -> AsyncGenerator[Dict[Component, Any], None]:
+ """Wrapper for handle_clear."""
+ update_dict = await handle_clear(webui_manager)
+ yield update_dict
+
+ # --- Connect Event Handlers using the Wrappers --
+ run_button.click(
+ fn=submit_wrapper, inputs=all_managed_components, outputs=run_tab_outputs
+ )
+ user_input.submit(
+ fn=submit_wrapper, inputs=all_managed_components, outputs=run_tab_outputs
+ )
+ stop_button.click(fn=stop_wrapper, inputs=None, outputs=run_tab_outputs)
+ pause_resume_button.click(
+ fn=pause_resume_wrapper, inputs=None, outputs=run_tab_outputs
+ )
+ clear_button.click(fn=clear_wrapper, inputs=None, outputs=run_tab_outputs)
diff --git a/src/webui/components/deep_research_agent_tab.py b/src/webui/components/deep_research_agent_tab.py
new file mode 100644
index 0000000..ff455b5
--- /dev/null
+++ b/src/webui/components/deep_research_agent_tab.py
@@ -0,0 +1,451 @@
+import gradio as gr
+from gradio.components import Component
+from functools import partial
+
+from src.webui.webui_manager import WebuiManager
+from src.utils import config
+import logging
+import os
+from typing import Any, Dict, AsyncGenerator, Optional, Tuple, Union
+import asyncio
+import json
+from src.agent.deep_research.deep_research_agent import DeepResearchAgent
+from src.utils import llm_provider
+
+logger = logging.getLogger(__name__)
+
+
+async def _initialize_llm(provider: Optional[str], model_name: Optional[str], temperature: float,
+ base_url: Optional[str], api_key: Optional[str], num_ctx: Optional[int] = None):
+ """Initializes the LLM based on settings. Returns None if provider/model is missing."""
+ if not provider or not model_name:
+ logger.info("LLM Provider or Model Name not specified, LLM will be None.")
+ return None
+ try:
+ logger.info(f"Initializing LLM: Provider={provider}, Model={model_name}, Temp={temperature}")
+ # Use your actual LLM provider logic here
+ llm = llm_provider.get_llm_model(
+ provider=provider,
+ model_name=model_name,
+ temperature=temperature,
+ base_url=base_url or None,
+ api_key=api_key or None,
+ num_ctx=num_ctx if provider == "ollama" else None
+ )
+ return llm
+ except Exception as e:
+ logger.error(f"Failed to initialize LLM: {e}", exc_info=True)
+ gr.Warning(
+ f"Failed to initialize LLM '{model_name}' for provider '{provider}'. Please check settings. Error: {e}")
+ return None
+
+
+def _read_file_safe(file_path: str) -> Optional[str]:
+ """Safely read a file, returning None if it doesn't exist or on error."""
+ if not os.path.exists(file_path):
+ return None
+ try:
+ with open(file_path, 'r', encoding='utf-8') as f:
+ return f.read()
+ except Exception as e:
+ logger.error(f"Error reading file {file_path}: {e}")
+ return None
+
+
+# --- Deep Research Agent Specific Logic ---
+
+async def run_deep_research(webui_manager: WebuiManager, components: Dict[Component, Any]) -> AsyncGenerator[
+ Dict[Component, Any], None]:
+ """Handles initializing and running the DeepResearchAgent."""
+
+ # --- Get Components ---
+ research_task_comp = webui_manager.get_component_by_id("deep_research_agent.research_task")
+ resume_task_id_comp = webui_manager.get_component_by_id("deep_research_agent.resume_task_id")
+ parallel_num_comp = webui_manager.get_component_by_id("deep_research_agent.parallel_num")
+ save_dir_comp = webui_manager.get_component_by_id(
+ "deep_research_agent.max_query") # Note: component ID seems misnamed in original code
+ start_button_comp = webui_manager.get_component_by_id("deep_research_agent.start_button")
+ stop_button_comp = webui_manager.get_component_by_id("deep_research_agent.stop_button")
+ markdown_display_comp = webui_manager.get_component_by_id("deep_research_agent.markdown_display")
+ markdown_download_comp = webui_manager.get_component_by_id("deep_research_agent.markdown_download")
+ mcp_server_config_comp = webui_manager.get_component_by_id("deep_research_agent.mcp_server_config")
+
+ # --- 1. Get Task and Settings ---
+ task_topic = components.get(research_task_comp, "").strip()
+ task_id_to_resume = components.get(resume_task_id_comp, "").strip() or None
+ max_parallel_agents = int(components.get(parallel_num_comp, 1))
+ base_save_dir = components.get(save_dir_comp, "./tmp/deep_research")
+ mcp_server_config_str = components.get(mcp_server_config_comp)
+ mcp_config = json.loads(mcp_server_config_str) if mcp_server_config_str else None
+
+ if not task_topic:
+ gr.Warning("Please enter a research task.")
+ yield {start_button_comp: gr.update(interactive=True)} # Re-enable start button
+ return
+
+ # Store base save dir for stop handler
+ webui_manager.dr_save_dir = base_save_dir
+ os.makedirs(base_save_dir, exist_ok=True)
+
+ # --- 2. Initial UI Update ---
+ yield {
+ start_button_comp: gr.update(value="ā³ Running...", interactive=False),
+ stop_button_comp: gr.update(interactive=True),
+ research_task_comp: gr.update(interactive=False),
+ resume_task_id_comp: gr.update(interactive=False),
+ parallel_num_comp: gr.update(interactive=False),
+ save_dir_comp: gr.update(interactive=False),
+ markdown_display_comp: gr.update(value="Starting research..."),
+ markdown_download_comp: gr.update(value=None, interactive=False)
+ }
+
+ agent_task = None
+ running_task_id = None
+ plan_file_path = None
+ report_file_path = None
+ last_plan_content = None
+ last_plan_mtime = 0
+
+ try:
+ # --- 3. Get LLM and Browser Config from other tabs ---
+ # Access settings values via components dict, getting IDs from webui_manager
+ def get_setting(tab: str, key: str, default: Any = None):
+ comp = webui_manager.id_to_component.get(f"{tab}.{key}")
+ return components.get(comp, default) if comp else default
+
+ # LLM Config (from agent_settings tab)
+ llm_provider_name = get_setting("agent_settings", "llm_provider")
+ llm_model_name = get_setting("agent_settings", "llm_model_name")
+ llm_temperature = max(get_setting("agent_settings", "llm_temperature", 0.5), 0.5)
+ llm_base_url = get_setting("agent_settings", "llm_base_url")
+ llm_api_key = get_setting("agent_settings", "llm_api_key")
+ ollama_num_ctx = get_setting("agent_settings", "ollama_num_ctx")
+
+ llm = await _initialize_llm(
+ llm_provider_name, llm_model_name, llm_temperature, llm_base_url, llm_api_key,
+ ollama_num_ctx if llm_provider_name == "ollama" else None
+ )
+ if not llm:
+ raise ValueError("LLM Initialization failed. Please check Agent Settings.")
+
+ # Browser Config (from browser_settings tab)
+ # Note: DeepResearchAgent constructor takes a dict, not full Browser/Context objects
+ browser_config_dict = {
+ "headless": get_setting("browser_settings", "headless", False),
+ "disable_security": get_setting("browser_settings", "disable_security", False),
+ "browser_binary_path": get_setting("browser_settings", "browser_binary_path"),
+ "user_data_dir": get_setting("browser_settings", "browser_user_data_dir"),
+ "window_width": int(get_setting("browser_settings", "window_w", 1280)),
+ "window_height": int(get_setting("browser_settings", "window_h", 1100)),
+ # Add other relevant fields if DeepResearchAgent accepts them
+ }
+
+ # --- 4. Initialize or Get Agent ---
+ if not webui_manager.dr_agent:
+ webui_manager.dr_agent = DeepResearchAgent(
+ llm=llm,
+ browser_config=browser_config_dict,
+ mcp_server_config=mcp_config
+ )
+ logger.info("DeepResearchAgent initialized.")
+
+ # --- 5. Start Agent Run ---
+ agent_run_coro = webui_manager.dr_agent.run(
+ topic=task_topic,
+ task_id=task_id_to_resume,
+ save_dir=base_save_dir,
+ max_parallel_browsers=max_parallel_agents
+ )
+ agent_task = asyncio.create_task(agent_run_coro)
+ webui_manager.dr_current_task = agent_task
+
+ # Wait briefly for the agent to start and potentially create the task ID/folder
+ await asyncio.sleep(1.0)
+
+ # Determine the actual task ID being used (agent sets this)
+ running_task_id = webui_manager.dr_agent.current_task_id
+ if not running_task_id:
+ # Agent might not have set it yet, try to get from result later? Risky.
+ # Or derive from resume_task_id if provided?
+ running_task_id = task_id_to_resume
+ if not running_task_id:
+ logger.warning("Could not determine running task ID immediately.")
+ # We can still monitor, but might miss initial plan if ID needed for path
+ else:
+ logger.info(f"Assuming task ID based on resume ID: {running_task_id}")
+ else:
+ logger.info(f"Agent started with Task ID: {running_task_id}")
+
+ webui_manager.dr_task_id = running_task_id # Store for stop handler
+
+ # --- 6. Monitor Progress via research_plan.md ---
+ if running_task_id:
+ task_specific_dir = os.path.join(base_save_dir, str(running_task_id))
+ plan_file_path = os.path.join(task_specific_dir, "research_plan.md")
+ report_file_path = os.path.join(task_specific_dir, "report.md")
+ logger.info(f"Monitoring plan file: {plan_file_path}")
+ else:
+ logger.warning("Cannot monitor plan file: Task ID unknown.")
+ plan_file_path = None
+ last_plan_content = None
+ while not agent_task.done():
+ update_dict = {}
+ update_dict[resume_task_id_comp] = gr.update(value=running_task_id)
+ agent_stopped = getattr(webui_manager.dr_agent, 'stopped', False)
+ if agent_stopped:
+ logger.info("Stop signal detected from agent state.")
+ break # Exit monitoring loop
+
+ # Check and update research plan display
+ if plan_file_path:
+ try:
+ current_mtime = os.path.getmtime(plan_file_path) if os.path.exists(plan_file_path) else 0
+ if current_mtime > last_plan_mtime:
+ logger.info(f"Detected change in {plan_file_path}")
+ plan_content = _read_file_safe(plan_file_path)
+ if last_plan_content is None or (
+ plan_content is not None and plan_content != last_plan_content):
+ update_dict[markdown_display_comp] = gr.update(value=plan_content)
+ last_plan_content = plan_content
+ last_plan_mtime = current_mtime
+ elif plan_content is None:
+ # File might have been deleted or became unreadable
+ last_plan_mtime = 0 # Reset to force re-read attempt later
+ except Exception as e:
+ logger.warning(f"Error checking/reading plan file {plan_file_path}: {e}")
+ # Avoid continuous logging for the same error
+ await asyncio.sleep(2.0)
+
+ # Yield updates if any
+ if update_dict:
+ yield update_dict
+
+ await asyncio.sleep(1.0) # Check file changes every second
+
+ # --- 7. Task Finalization ---
+ logger.info("Agent task processing finished. Awaiting final result...")
+ final_result_dict = await agent_task # Get result or raise exception
+ logger.info(f"Agent run completed. Result keys: {final_result_dict.keys() if final_result_dict else 'None'}")
+
+ # Try to get task ID from result if not known before
+ if not running_task_id and final_result_dict and 'task_id' in final_result_dict:
+ running_task_id = final_result_dict['task_id']
+ webui_manager.dr_task_id = running_task_id
+ task_specific_dir = os.path.join(base_save_dir, str(running_task_id))
+ report_file_path = os.path.join(task_specific_dir, "report.md")
+ logger.info(f"Task ID confirmed from result: {running_task_id}")
+
+ final_ui_update = {}
+ if report_file_path and os.path.exists(report_file_path):
+ logger.info(f"Loading final report from: {report_file_path}")
+ report_content = _read_file_safe(report_file_path)
+ if report_content:
+ final_ui_update[markdown_display_comp] = gr.update(value=report_content)
+ final_ui_update[markdown_download_comp] = gr.File(value=report_file_path,
+ label=f"Report ({running_task_id}.md)",
+ interactive=True)
+ else:
+ final_ui_update[markdown_display_comp] = gr.update(
+ value="# Research Complete\n\n*Error reading final report file.*")
+ elif final_result_dict and 'report' in final_result_dict:
+ logger.info("Using report content directly from agent result.")
+ # If agent directly returns report content
+ final_ui_update[markdown_display_comp] = gr.update(value=final_result_dict['report'])
+ # Cannot offer download if only content is available
+ final_ui_update[markdown_download_comp] = gr.update(value=None, label="Download Research Report",
+ interactive=False)
+ else:
+ logger.warning("Final report file not found and not in result dict.")
+ final_ui_update[markdown_display_comp] = gr.update(value="# Research Complete\n\n*Final report not found.*")
+
+ yield final_ui_update
+
+
+ except Exception as e:
+ logger.error(f"Error during Deep Research Agent execution: {e}", exc_info=True)
+ gr.Error(f"Research failed: {e}")
+ yield {markdown_display_comp: gr.update(value=f"# Research Failed\n\n**Error:**\n```\n{e}\n```")}
+
+ finally:
+ # --- 8. Final UI Reset ---
+ webui_manager.dr_current_task = None # Clear task reference
+ webui_manager.dr_task_id = None # Clear running task ID
+
+ yield {
+ start_button_comp: gr.update(value="ā¶ļø Run", interactive=True),
+ stop_button_comp: gr.update(interactive=False),
+ research_task_comp: gr.update(interactive=True),
+ resume_task_id_comp: gr.update(value="", interactive=True),
+ parallel_num_comp: gr.update(interactive=True),
+ save_dir_comp: gr.update(interactive=True),
+ # Keep download button enabled if file exists
+ markdown_download_comp: gr.update() if report_file_path and os.path.exists(report_file_path) else gr.update(
+ interactive=False)
+ }
+
+
+async def stop_deep_research(webui_manager: WebuiManager) -> Dict[Component, Any]:
+ """Handles the Stop button click."""
+ logger.info("Stop button clicked for Deep Research.")
+ agent = webui_manager.dr_agent
+ task = webui_manager.dr_current_task
+ task_id = webui_manager.dr_task_id
+ base_save_dir = webui_manager.dr_save_dir
+
+ stop_button_comp = webui_manager.get_component_by_id("deep_research_agent.stop_button")
+ start_button_comp = webui_manager.get_component_by_id("deep_research_agent.start_button")
+ markdown_display_comp = webui_manager.get_component_by_id("deep_research_agent.markdown_display")
+ markdown_download_comp = webui_manager.get_component_by_id("deep_research_agent.markdown_download")
+
+ final_update = {
+ stop_button_comp: gr.update(interactive=False, value="ā¹ļø Stopping...")
+ }
+
+ if agent and task and not task.done():
+ logger.info("Signalling DeepResearchAgent to stop.")
+ try:
+            # Assumes agent.stop() returns quickly after setting a stop flag
+ await agent.stop()
+ except Exception as e:
+ logger.error(f"Error calling agent.stop(): {e}")
+
+ # The run_deep_research loop should detect the stop and exit.
+    # We return an intermediate "Stopping..." state. The final reset is done by run_deep_research.
+
+ # Try to show the final report if available after stopping
+ await asyncio.sleep(1.5) # Give agent a moment to write final files potentially
+ report_file_path = None
+ if task_id and base_save_dir:
+ report_file_path = os.path.join(base_save_dir, str(task_id), "report.md")
+
+ if report_file_path and os.path.exists(report_file_path):
+ report_content = _read_file_safe(report_file_path)
+ if report_content:
+ final_update[markdown_display_comp] = gr.update(
+ value=report_content + "\n\n---\n*Research stopped by user.*")
+ final_update[markdown_download_comp] = gr.File(value=report_file_path, label=f"Report ({task_id}.md)",
+ interactive=True)
+ else:
+ final_update[markdown_display_comp] = gr.update(
+ value="# Research Stopped\n\n*Error reading final report file after stop.*")
+ else:
+ final_update[markdown_display_comp] = gr.update(value="# Research Stopped by User")
+
+ # Keep start button disabled, run_deep_research finally block will re-enable it.
+ final_update[start_button_comp] = gr.update(interactive=False)
+
+ else:
+ logger.warning("Stop clicked but no active research task found.")
+ # Reset UI state just in case
+ final_update = {
+ start_button_comp: gr.update(interactive=True),
+ stop_button_comp: gr.update(interactive=False),
+ webui_manager.get_component_by_id("deep_research_agent.research_task"): gr.update(interactive=True),
+ webui_manager.get_component_by_id("deep_research_agent.resume_task_id"): gr.update(interactive=True),
+ webui_manager.get_component_by_id("deep_research_agent.max_iteration"): gr.update(interactive=True),
+ webui_manager.get_component_by_id("deep_research_agent.max_query"): gr.update(interactive=True),
+ }
+
+ return final_update
+
+
+async def update_mcp_server(mcp_file: str, webui_manager: WebuiManager):
+ """
+ Update the MCP server.
+ """
+ if hasattr(webui_manager, "dr_agent") and webui_manager.dr_agent:
+ logger.warning("ā ļø Close controller because mcp file has changed!")
+ await webui_manager.dr_agent.close_mcp_client()
+
+ if not mcp_file or not os.path.exists(mcp_file) or not mcp_file.endswith('.json'):
+ logger.warning(f"{mcp_file} is not a valid MCP file.")
+ return None, gr.update(visible=False)
+
+ with open(mcp_file, 'r') as f:
+ mcp_server = json.load(f)
+
+ return json.dumps(mcp_server, indent=2), gr.update(visible=True)
+
+
+def create_deep_research_agent_tab(webui_manager: WebuiManager):
+ """
+ Creates a deep research agent tab
+ """
+ input_components = set(webui_manager.get_components())
+ tab_components = {}
+
+ with gr.Group():
+ with gr.Row():
+ mcp_json_file = gr.File(label="MCP server json", interactive=True, file_types=[".json"])
+ mcp_server_config = gr.Textbox(label="MCP server", lines=6, interactive=True, visible=False)
+
+ with gr.Group():
+ research_task = gr.Textbox(label="Research Task", lines=5,
+ value="Give me a detailed travel plan to Switzerland from June 1st to 10th.",
+ interactive=True)
+ with gr.Row():
+ resume_task_id = gr.Textbox(label="Resume Task ID", value="",
+ interactive=True)
+ parallel_num = gr.Number(label="Parallel Agent Num", value=1,
+ precision=0,
+ interactive=True)
+ max_query = gr.Textbox(label="Research Save Dir", value="./tmp/deep_research",
+ interactive=True)
+ with gr.Row():
+ stop_button = gr.Button("ā¹ļø Stop", variant="stop", scale=2)
+ start_button = gr.Button("ā¶ļø Run", variant="primary", scale=3)
+ with gr.Group():
+ markdown_display = gr.Markdown(label="Research Report")
+ markdown_download = gr.File(label="Download Research Report", interactive=False)
+ tab_components.update(
+ dict(
+ research_task=research_task,
+ parallel_num=parallel_num,
+ max_query=max_query,
+ start_button=start_button,
+ stop_button=stop_button,
+ markdown_display=markdown_display,
+ markdown_download=markdown_download,
+ resume_task_id=resume_task_id,
+ mcp_json_file=mcp_json_file,
+ mcp_server_config=mcp_server_config,
+ )
+ )
+ webui_manager.add_components("deep_research_agent", tab_components)
+ webui_manager.init_deep_research_agent()
+
+ async def update_wrapper(mcp_file):
+        """Wrapper for update_mcp_server."""
+ update_dict = await update_mcp_server(mcp_file, webui_manager)
+ yield update_dict
+
+ mcp_json_file.change(
+ update_wrapper,
+ inputs=[mcp_json_file],
+ outputs=[mcp_server_config, mcp_server_config]
+ )
+
+ dr_tab_outputs = list(tab_components.values())
+ all_managed_inputs = set(webui_manager.get_components())
+
+ # --- Define Event Handler Wrappers ---
+ async def start_wrapper(comps: Dict[Component, Any]) -> AsyncGenerator[Dict[Component, Any], None]:
+ async for update in run_deep_research(webui_manager, comps):
+ yield update
+
+ async def stop_wrapper() -> AsyncGenerator[Dict[Component, Any], None]:
+ update_dict = await stop_deep_research(webui_manager)
+ yield update_dict
+
+ # --- Connect Handlers ---
+ start_button.click(
+ fn=start_wrapper,
+ inputs=all_managed_inputs,
+ outputs=dr_tab_outputs
+ )
+
+ stop_button.click(
+ fn=stop_wrapper,
+ inputs=None,
+ outputs=dr_tab_outputs
+ )
diff --git a/src/webui/components/load_save_config_tab.py b/src/webui/components/load_save_config_tab.py
new file mode 100644
index 0000000..aaa1441
--- /dev/null
+++ b/src/webui/components/load_save_config_tab.py
@@ -0,0 +1,50 @@
+import gradio as gr
+from gradio.components import Component
+
+from src.webui.webui_manager import WebuiManager
+from src.utils import config
+
+
+def create_load_save_config_tab(webui_manager: WebuiManager):
+ """
+ Creates a load and save config tab.
+ """
+ input_components = set(webui_manager.get_components())
+ tab_components = {}
+
+ config_file = gr.File(
+ label="Load UI Settings from json",
+ file_types=[".json"],
+ interactive=True
+ )
+ with gr.Row():
+ load_config_button = gr.Button("Load Config", variant="primary")
+ save_config_button = gr.Button("Save UI Settings", variant="primary")
+
+ config_status = gr.Textbox(
+ label="Status",
+ lines=2,
+ interactive=False
+ )
+
+ tab_components.update(dict(
+ load_config_button=load_config_button,
+ save_config_button=save_config_button,
+ config_status=config_status,
+ config_file=config_file,
+ ))
+
+ webui_manager.add_components("load_save_config", tab_components)
+
+ save_config_button.click(
+ fn=webui_manager.save_config,
+ inputs=set(webui_manager.get_components()),
+ outputs=[config_status]
+ )
+
+ load_config_button.click(
+ fn=webui_manager.load_config,
+ inputs=[config_file],
+ outputs=webui_manager.get_components(),
+ )
+
diff --git a/src/webui/interface.py b/src/webui/interface.py
new file mode 100644
index 0000000..083649e
--- /dev/null
+++ b/src/webui/interface.py
@@ -0,0 +1,95 @@
+import gradio as gr
+
+from src.webui.webui_manager import WebuiManager
+from src.webui.components.agent_settings_tab import create_agent_settings_tab
+from src.webui.components.browser_settings_tab import create_browser_settings_tab
+from src.webui.components.browser_use_agent_tab import create_browser_use_agent_tab
+from src.webui.components.deep_research_agent_tab import create_deep_research_agent_tab
+from src.webui.components.load_save_config_tab import create_load_save_config_tab
+
+theme_map = {
+ "Default": gr.themes.Default(),
+ "Soft": gr.themes.Soft(),
+ "Monochrome": gr.themes.Monochrome(),
+ "Glass": gr.themes.Glass(),
+ "Origin": gr.themes.Origin(),
+ "Citrus": gr.themes.Citrus(),
+ "Ocean": gr.themes.Ocean(),
+ "Base": gr.themes.Base()
+}
+
+
+def create_ui(theme_name="Ocean"):
+ css = """
+ .gradio-container {
+ width: 70vw !important;
+ max-width: 70% !important;
+ margin-left: auto !important;
+ margin-right: auto !important;
+ padding-top: 10px !important;
+ }
+ .header-text {
+ text-align: center;
+ margin-bottom: 20px;
+ }
+ .tab-header-text {
+ text-align: center;
+ }
+ .theme-section {
+ margin-bottom: 10px;
+ padding: 15px;
+ border-radius: 10px;
+ }
+ """
+
+ # dark mode in default
+ js_func = """
+ function refresh() {
+ const url = new URL(window.location);
+
+ if (url.searchParams.get('__theme') !== 'dark') {
+ url.searchParams.set('__theme', 'dark');
+ window.location.href = url.href;
+ }
+ }
+ """
+
+ ui_manager = WebuiManager()
+
+ with gr.Blocks(
+ title="Browser Use WebUI", theme=theme_map[theme_name], css=css, js=js_func,
+ ) as demo:
+ with gr.Row():
+ gr.Markdown(
+ """
+ # š Browser Use WebUI
+ ### Control your browser with AI assistance
+ """,
+ elem_classes=["header-text"],
+ )
+
+ with gr.Tabs() as tabs:
+ with gr.TabItem("āļø Agent Settings"):
+ create_agent_settings_tab(ui_manager)
+
+ with gr.TabItem("š Browser Settings"):
+ create_browser_settings_tab(ui_manager)
+
+ with gr.TabItem("š¤ Run Agent"):
+ create_browser_use_agent_tab(ui_manager)
+
+ with gr.TabItem("š Agent Marketplace"):
+ gr.Markdown(
+ """
+ ### Agents built on Browser-Use
+ """,
+ elem_classes=["tab-header-text"],
+ )
+ with gr.Tabs():
+ with gr.TabItem("Deep Research"):
+ create_deep_research_agent_tab(ui_manager)
+
+ with gr.TabItem("š Load & Save Config"):
+ create_load_save_config_tab(ui_manager)
+
+ return demo
diff --git a/src/webui/webui_manager.py b/src/webui/webui_manager.py
new file mode 100644
index 0000000..542d387
--- /dev/null
+++ b/src/webui/webui_manager.py
@@ -0,0 +1,118 @@
+import json
+from collections.abc import Generator
+from typing import TYPE_CHECKING
+import os
+import gradio as gr
+from datetime import datetime
+from typing import Optional, Dict, List
+import uuid
+import asyncio
+
+from gradio.components import Component
+from browser_use.browser.browser import Browser
+from browser_use.browser.context import BrowserContext
+from browser_use.agent.service import Agent
+from src.browser.custom_browser import CustomBrowser
+from src.browser.custom_context import CustomBrowserContext
+from src.controller.custom_controller import CustomController
+from src.agent.deep_research.deep_research_agent import DeepResearchAgent
+
+
+class WebuiManager:
+ def __init__(self, settings_save_dir: str = "./tmp/webui_settings"):
+ self.id_to_component: dict[str, Component] = {}
+ self.component_to_id: dict[Component, str] = {}
+
+ self.settings_save_dir = settings_save_dir
+ os.makedirs(self.settings_save_dir, exist_ok=True)
+
+ def init_browser_use_agent(self) -> None:
+ """
+ init browser use agent
+ """
+ self.bu_agent: Optional[Agent] = None
+ self.bu_browser: Optional[CustomBrowser] = None
+ self.bu_browser_context: Optional[CustomBrowserContext] = None
+ self.bu_controller: Optional[CustomController] = None
+ self.bu_chat_history: List[Dict[str, Optional[str]]] = []
+ self.bu_response_event: Optional[asyncio.Event] = None
+ self.bu_user_help_response: Optional[str] = None
+ self.bu_current_task: Optional[asyncio.Task] = None
+ self.bu_agent_task_id: Optional[str] = None
+
+ def init_deep_research_agent(self) -> None:
+ """
+ init deep research agent
+ """
+ self.dr_agent: Optional[DeepResearchAgent] = None
+ self.dr_current_task = None
+ self.dr_agent_task_id: Optional[str] = None
+ self.dr_save_dir: Optional[str] = None
+
+ def add_components(self, tab_name: str, components_dict: dict[str, "Component"]) -> None:
+ """
+ Add tab components
+ """
+ for comp_name, component in components_dict.items():
+ comp_id = f"{tab_name}.{comp_name}"
+ self.id_to_component[comp_id] = component
+ self.component_to_id[component] = comp_id
+
+ def get_components(self) -> list["Component"]:
+ """
+ Get all components
+ """
+ return list(self.id_to_component.values())
+
+ def get_component_by_id(self, comp_id: str) -> "Component":
+ """
+ Get component by id
+ """
+ return self.id_to_component[comp_id]
+
+ def get_id_by_component(self, comp: "Component") -> str:
+ """
+ Get id by component
+ """
+ return self.component_to_id[comp]
+
+ def save_config(self, components: Dict["Component", str]) -> None:
+ """
+ Save config
+ """
+ cur_settings = {}
+ for comp in components:
+ if not isinstance(comp, gr.Button) and not isinstance(comp, gr.File) and str(
+ getattr(comp, "interactive", True)).lower() != "false":
+ comp_id = self.get_id_by_component(comp)
+ cur_settings[comp_id] = components[comp]
+
+ config_name = datetime.now().strftime("%Y%m%d-%H%M%S")
+ with open(os.path.join(self.settings_save_dir, f"{config_name}.json"), "w") as fw:
+ json.dump(cur_settings, fw, indent=4)
+
+ return os.path.join(self.settings_save_dir, f"{config_name}.json")
+
+ def load_config(self, config_path: str):
+ """
+ Load config
+ """
+ with open(config_path, "r") as fr:
+ ui_settings = json.load(fr)
+
+ update_components = {}
+ for comp_id, comp_val in ui_settings.items():
+ if comp_id in self.id_to_component:
+ comp = self.id_to_component[comp_id]
+ if comp.__class__.__name__ == "Chatbot":
+ update_components[comp] = comp.__class__(value=comp_val, type="messages")
+ else:
+ update_components[comp] = comp.__class__(value=comp_val)
+
+ config_status = self.id_to_component["load_save_config.config_status"]
+ update_components.update(
+ {
+ config_status: config_status.__class__(value=f"Successfully loaded config: {config_path}")
+ }
+ )
+ yield update_components
diff --git a/supervisord.conf b/supervisord.conf
index 3410b91..6010766 100644
--- a/supervisord.conf
+++ b/supervisord.conf
@@ -3,7 +3,7 @@ user=root
nodaemon=true
logfile=/dev/stdout
logfile_maxbytes=0
-loglevel=debug
+loglevel=error
[program:xvfb]
command=Xvfb :99 -screen 0 %(ENV_RESOLUTION)s -ac +extension GLX +render -noreset
@@ -65,21 +65,6 @@ startretries=5
startsecs=3
depends_on=x11vnc
-[program:persistent_browser]
-environment=START_URL="data:text/html,Browser Ready
"
-command=bash -c "mkdir -p /app/data/chrome_data && sleep 8 && $(find /ms-playwright/chromium-*/chrome-linux -name chrome) --user-data-dir=/app/data/chrome_data --window-position=0,0 --window-size=%(ENV_RESOLUTION_WIDTH)s,%(ENV_RESOLUTION_HEIGHT)s --start-maximized --no-sandbox --disable-dev-shm-usage --disable-gpu --disable-software-rasterizer --disable-setuid-sandbox --no-first-run --no-default-browser-check --no-experiments --ignore-certificate-errors --remote-debugging-port=9222 --remote-debugging-address=0.0.0.0 \"$START_URL\""
-autorestart=true
-stdout_logfile=/dev/stdout
-stdout_logfile_maxbytes=0
-stderr_logfile=/dev/stderr
-stderr_logfile_maxbytes=0
-priority=350
-startretries=5
-startsecs=10
-stopsignal=TERM
-stopwaitsecs=15
-depends_on=novnc
-
[program:webui]
command=python webui.py --ip 0.0.0.0 --port 7788
directory=/app
@@ -92,5 +77,4 @@ priority=400
startretries=3
startsecs=3
stopsignal=TERM
-stopwaitsecs=10
-depends_on=persistent_browser
+stopwaitsecs=10
\ No newline at end of file
diff --git a/tests/test_agents.py b/tests/test_agents.py
new file mode 100644
index 0000000..1285167
--- /dev/null
+++ b/tests/test_agents.py
@@ -0,0 +1,394 @@
+import pdb
+
+from dotenv import load_dotenv
+
+load_dotenv()
+import sys
+
+sys.path.append(".")
+import asyncio
+import os
+import sys
+from pprint import pprint
+
+from browser_use import Agent
+from browser_use.agent.views import AgentHistoryList
+
+from src.utils import utils
+
+
+async def test_browser_use_agent():
+ from browser_use.browser.browser import Browser, BrowserConfig
+ from browser_use.browser.context import (
+ BrowserContextConfig
+ )
+ from browser_use.agent.service import Agent
+
+ from src.browser.custom_browser import CustomBrowser
+ from src.controller.custom_controller import CustomController
+ from src.utils import llm_provider
+ from src.agent.browser_use.browser_use_agent import BrowserUseAgent
+
+ # llm = utils.get_llm_model(
+ # provider="openai",
+ # model_name="gpt-4o",
+ # temperature=0.8,
+ # base_url=os.getenv("OPENAI_ENDPOINT", ""),
+ # api_key=os.getenv("OPENAI_API_KEY", ""),
+ # )
+
+ llm = llm_provider.get_llm_model(
+ provider="google",
+ model_name="gemini-2.0-flash",
+ temperature=0.6,
+ api_key=os.getenv("GOOGLE_API_KEY", "")
+ )
+
+ # llm = utils.get_llm_model(
+ # provider="deepseek",
+ # model_name="deepseek-reasoner",
+ # temperature=0.8
+ # )
+
+ # llm = utils.get_llm_model(
+ # provider="deepseek",
+ # model_name="deepseek-chat",
+ # temperature=0.8
+ # )
+
+ # llm = utils.get_llm_model(
+ # provider="ollama", model_name="qwen2.5:7b", temperature=0.5
+ # )
+
+ # llm = utils.get_llm_model(
+ # provider="ollama", model_name="deepseek-r1:14b", temperature=0.5
+ # )
+
+ window_w, window_h = 1280, 1100
+
+ # llm = llm_provider.get_llm_model(
+ # provider="azure_openai",
+ # model_name="gpt-4o",
+ # temperature=0.5,
+ # base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
+ # api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
+ # )
+
+ mcp_server_config = {
+ "mcpServers": {
+ # "markitdown": {
+ # "command": "docker",
+ # "args": [
+ # "run",
+ # "--rm",
+ # "-i",
+ # "markitdown-mcp:latest"
+ # ]
+ # },
+ "desktop-commander": {
+ "command": "npx",
+ "args": [
+ "-y",
+ "@wonderwhy-er/desktop-commander"
+ ]
+ },
+ }
+ }
+ controller = CustomController()
+ await controller.setup_mcp_client(mcp_server_config)
+ use_own_browser = True
+ use_vision = True # Set to False when using DeepSeek
+
+ max_actions_per_step = 10
+ browser = None
+ browser_context = None
+
+ try:
+ extra_browser_args = [f"--window-size={window_w},{window_h}"]
+ if use_own_browser:
+ browser_binary_path = os.getenv("BROWSER_PATH", None)
+ if browser_binary_path == "":
+ browser_binary_path = None
+ browser_user_data = os.getenv("BROWSER_USER_DATA", None)
+ if browser_user_data:
+ extra_browser_args += [f"--user-data-dir={browser_user_data}"]
+ else:
+ browser_binary_path = None
+ browser = CustomBrowser(
+ config=BrowserConfig(
+ headless=False,
+ browser_binary_path=browser_binary_path,
+ extra_browser_args=extra_browser_args,
+ )
+ )
+ browser_context = await browser.new_context(
+ config=BrowserContextConfig(
+ trace_path=None,
+ save_recording_path=None,
+ save_downloads_path="./tmp/downloads",
+ window_height=window_h,
+ window_width=window_w,
+ )
+ )
+ agent = BrowserUseAgent(
+ # task="download pdf from https://arxiv.org/pdf/2311.16498 and rename this pdf to 'mcp-test.pdf'",
+ task="give me nvidia stock price",
+ llm=llm,
+ browser=browser,
+ browser_context=browser_context,
+ controller=controller,
+ use_vision=use_vision,
+ max_actions_per_step=max_actions_per_step,
+ generate_gif=True
+ )
+ history: AgentHistoryList = await agent.run(max_steps=100)
+
+ print("Final Result:")
+ pprint(history.final_result(), indent=4)
+
+ print("\nErrors:")
+ pprint(history.errors(), indent=4)
+
+ except Exception:
+ import traceback
+ traceback.print_exc()
+ finally:
+ if browser_context:
+ await browser_context.close()
+ if browser:
+ await browser.close()
+ if controller:
+ await controller.close_mcp_client()
+
+
+async def test_browser_use_parallel():
+ from browser_use.browser.browser import Browser, BrowserConfig
+ from browser_use.browser.context import (
+ BrowserContextConfig,
+ )
+ from browser_use.agent.service import Agent
+
+ from src.browser.custom_browser import CustomBrowser
+ from src.controller.custom_controller import CustomController
+ from src.utils import llm_provider
+ from src.agent.browser_use.browser_use_agent import BrowserUseAgent
+
+ # llm = utils.get_llm_model(
+ # provider="openai",
+ # model_name="gpt-4o",
+ # temperature=0.8,
+ # base_url=os.getenv("OPENAI_ENDPOINT", ""),
+ # api_key=os.getenv("OPENAI_API_KEY", ""),
+ # )
+
+ # llm = utils.get_llm_model(
+ # provider="google",
+ # model_name="gemini-2.0-flash",
+ # temperature=0.6,
+ # api_key=os.getenv("GOOGLE_API_KEY", "")
+ # )
+
+ # llm = utils.get_llm_model(
+ # provider="deepseek",
+ # model_name="deepseek-reasoner",
+ # temperature=0.8
+ # )
+
+ # llm = utils.get_llm_model(
+ # provider="deepseek",
+ # model_name="deepseek-chat",
+ # temperature=0.8
+ # )
+
+ # llm = utils.get_llm_model(
+ # provider="ollama", model_name="qwen2.5:7b", temperature=0.5
+ # )
+
+ # llm = utils.get_llm_model(
+ # provider="ollama", model_name="deepseek-r1:14b", temperature=0.5
+ # )
+
+ window_w, window_h = 1280, 1100
+
+ llm = llm_provider.get_llm_model(
+ provider="azure_openai",
+ model_name="gpt-4o",
+ temperature=0.5,
+ base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
+ api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
+ )
+
+ mcp_server_config = {
+ "mcpServers": {
+ # "markitdown": {
+ # "command": "docker",
+ # "args": [
+ # "run",
+ # "--rm",
+ # "-i",
+ # "markitdown-mcp:latest"
+ # ]
+ # },
+ "desktop-commander": {
+ "command": "npx",
+ "args": [
+ "-y",
+ "@wonderwhy-er/desktop-commander"
+ ]
+ },
+ # "filesystem": {
+ # "command": "npx",
+ # "args": [
+ # "-y",
+ # "@modelcontextprotocol/server-filesystem",
+ # "/Users/xxx/ai_workspace",
+ # ]
+ # },
+ }
+ }
+ controller = CustomController()
+ await controller.setup_mcp_client(mcp_server_config)
+ use_own_browser = True
+ use_vision = True # Set to False when using DeepSeek
+
+ max_actions_per_step = 10
+ browser = None
+ browser_context = None
+
+ try:
+ extra_browser_args = [f"--window-size={window_w},{window_h}"]
+ if use_own_browser:
+ browser_binary_path = os.getenv("BROWSER_PATH", None)
+ if browser_binary_path == "":
+ browser_binary_path = None
+ browser_user_data = os.getenv("BROWSER_USER_DATA", None)
+ if browser_user_data:
+ extra_browser_args += [f"--user-data-dir={browser_user_data}"]
+ else:
+ browser_binary_path = None
+ browser = CustomBrowser(
+ config=BrowserConfig(
+ headless=False,
+ browser_binary_path=browser_binary_path,
+ extra_browser_args=extra_browser_args,
+ )
+ )
+ browser_context = await browser.new_context(
+ config=BrowserContextConfig(
+ trace_path=None,
+ save_recording_path=None,
+ save_downloads_path="./tmp/downloads",
+ window_height=window_h,
+ window_width=window_w,
+ force_new_context=True
+ )
+ )
+ agents = [
+ BrowserUseAgent(task=task, llm=llm, browser=browser, controller=controller)
+ for task in [
+ 'Search Google for weather in Tokyo',
+ # 'Check Reddit front page title',
+ # 'Find NASA image of the day',
+ # 'Check top story on CNN',
+ # 'Search latest SpaceX launch date',
+ # 'Look up population of Paris',
+ 'Find current time in Sydney',
+ 'Check who won last Super Bowl',
+ # 'Search trending topics on Twitter',
+ ]
+ ]
+
+ history = await asyncio.gather(*[agent.run() for agent in agents])
+ print("Final Result:")
+ pprint(history.final_result(), indent=4)
+
+ print("\nErrors:")
+ pprint(history.errors(), indent=4)
+
+ pdb.set_trace()
+
+ except Exception:
+ import traceback
+
+ traceback.print_exc()
+ finally:
+ if browser_context:
+ await browser_context.close()
+ if browser:
+ await browser.close()
+ if controller:
+ await controller.close_mcp_client()
+
+
+async def test_deep_research_agent():
+ from src.agent.deep_research.deep_research_agent import DeepResearchAgent, PLAN_FILENAME, REPORT_FILENAME
+ from src.utils import llm_provider
+
+ llm = llm_provider.get_llm_model(
+ provider="openai",
+ model_name="gpt-4o",
+ temperature=0.5
+ )
+
+ # llm = llm_provider.get_llm_model(
+ # provider="bedrock",
+ # )
+
+ mcp_server_config = {
+ "mcpServers": {
+ "desktop-commander": {
+ "command": "npx",
+ "args": [
+ "-y",
+ "@wonderwhy-er/desktop-commander"
+ ]
+ },
+ }
+ }
+
+ browser_config = {"headless": False, "window_width": 1280, "window_height": 1100, "use_own_browser": False}
+ agent = DeepResearchAgent(llm=llm, browser_config=browser_config, mcp_server_config=mcp_server_config)
+ research_topic = "Give me investment advices of nvidia and tesla."
+ task_id_to_resume = "" # Set this to resume a previous task ID
+
+ print(f"Starting research on: {research_topic}")
+
+ try:
+ # Call run and wait for the final result dictionary
+ result = await agent.run(research_topic,
+ task_id=task_id_to_resume,
+ save_dir="./tmp/deep_research",
+ max_parallel_browsers=1,
+ )
+
+ print("\n--- Research Process Ended ---")
+ print(f"Status: {result.get('status')}")
+ print(f"Message: {result.get('message')}")
+ print(f"Task ID: {result.get('task_id')}")
+
+ # Check the final state for the report
+ final_state = result.get('final_state', {})
+ if final_state:
+ print("\n--- Final State Summary ---")
+ print(
+ f" Plan Steps Completed: {sum(1 for item in final_state.get('research_plan', []) if item.get('status') == 'completed')}")
+ print(f" Total Search Results Logged: {len(final_state.get('search_results', []))}")
+ if final_state.get("final_report"):
+ print(" Final Report: Generated (content omitted). You can find it in the output directory.")
+ # print("\n--- Final Report ---") # Optionally print report
+ # print(final_state["final_report"])
+ else:
+ print(" Final Report: Not generated.")
+ else:
+ print("Final state information not available.")
+
+
+ except Exception as e:
+ print(f"\n--- An unhandled error occurred outside the agent run ---")
+ print(e)
+
+
+if __name__ == "__main__":
+ asyncio.run(test_browser_use_agent())
+ # asyncio.run(test_browser_use_parallel())
+ # asyncio.run(test_deep_research_agent())
diff --git a/tests/test_browser_use.py b/tests/test_browser_use.py
deleted file mode 100644
index 6ef4210..0000000
--- a/tests/test_browser_use.py
+++ /dev/null
@@ -1,364 +0,0 @@
-import pdb
-
-from dotenv import load_dotenv
-
-load_dotenv()
-import sys
-
-sys.path.append(".")
-import asyncio
-import os
-import sys
-from pprint import pprint
-
-from browser_use import Agent
-from browser_use.agent.views import AgentHistoryList
-
-from src.utils import utils
-
-
-async def test_browser_use_org():
- from browser_use.browser.browser import Browser, BrowserConfig
- from browser_use.browser.context import (
- BrowserContextConfig,
- BrowserContextWindowSize,
- )
-
- # llm = utils.get_llm_model(
- # provider="azure_openai",
- # model_name="gpt-4o",
- # temperature=0.8,
- # base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
- # api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
- # )
-
- # llm = utils.get_llm_model(
- # provider="deepseek",
- # model_name="deepseek-chat",
- # temperature=0.8
- # )
-
- llm = utils.get_llm_model(
- provider="ollama", model_name="deepseek-r1:14b", temperature=0.5
- )
-
- window_w, window_h = 1920, 1080
- use_vision = False
- use_own_browser = False
- if use_own_browser:
- chrome_path = os.getenv("CHROME_PATH", None)
- if chrome_path == "":
- chrome_path = None
- else:
- chrome_path = None
-
- tool_calling_method = "json_schema" # setting to json_schema when using ollma
-
- browser = Browser(
- config=BrowserConfig(
- headless=False,
- disable_security=True,
- chrome_instance_path=chrome_path,
- extra_chromium_args=[f"--window-size={window_w},{window_h}"],
- )
- )
- async with await browser.new_context(
- config=BrowserContextConfig(
- trace_path="./tmp/traces",
- save_recording_path="./tmp/record_videos",
- no_viewport=False,
- browser_window_size=BrowserContextWindowSize(
- width=window_w, height=window_h
- ),
- )
- ) as browser_context:
- agent = Agent(
- task="go to google.com and type 'OpenAI' click search and give me the first url",
- llm=llm,
- browser_context=browser_context,
- use_vision=use_vision,
- tool_calling_method=tool_calling_method
- )
- history: AgentHistoryList = await agent.run(max_steps=10)
-
- print("Final Result:")
- pprint(history.final_result(), indent=4)
-
- print("\nErrors:")
- pprint(history.errors(), indent=4)
-
- # e.g. xPaths the model clicked on
- print("\nModel Outputs:")
- pprint(history.model_actions(), indent=4)
-
- print("\nThoughts:")
- pprint(history.model_thoughts(), indent=4)
- # close browser
- await browser.close()
-
-
-async def test_browser_use_custom():
- from browser_use.browser.context import BrowserContextWindowSize
- from browser_use.browser.browser import BrowserConfig
- from playwright.async_api import async_playwright
-
- from src.agent.custom_agent import CustomAgent
- from src.agent.custom_prompts import CustomSystemPrompt, CustomAgentMessagePrompt
- from src.browser.custom_browser import CustomBrowser
- from src.browser.custom_context import BrowserContextConfig
- from src.controller.custom_controller import CustomController
-
- window_w, window_h = 1280, 1100
-
- # llm = utils.get_llm_model(
- # provider="openai",
- # model_name="gpt-4o",
- # temperature=0.8,
- # base_url=os.getenv("OPENAI_ENDPOINT", ""),
- # api_key=os.getenv("OPENAI_API_KEY", ""),
- # )
-
- # llm = utils.get_llm_model(
- # provider="azure_openai",
- # model_name="gpt-4o",
- # temperature=0.6,
- # base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
- # api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
- # )
-
- llm = utils.get_llm_model(
- provider="google",
- model_name="gemini-2.0-flash",
- temperature=0.6,
- api_key=os.getenv("GOOGLE_API_KEY", "")
- )
-
- llm = utils.get_llm_model(
- provider="deepseek",
- model_name="deepseek-reasoner",
- temperature=0.8
- )
-
- # llm = utils.get_llm_model(
- # provider="deepseek",
- # model_name="deepseek-chat",
- # temperature=0.8
- # )
-
- # llm = utils.get_llm_model(
- # provider="ollama", model_name="qwen2.5:7b", temperature=0.5
- # )
-
- # llm = utils.get_llm_model(
- # provider="ollama", model_name="deepseek-r1:14b", temperature=0.5
- # )
-
- controller = CustomController()
- use_own_browser = True
- disable_security = True
- use_vision = False # Set to False when using DeepSeek
-
- max_actions_per_step = 1
- playwright = None
- browser = None
- browser_context = None
-
- try:
- extra_chromium_args = [f"--window-size={window_w},{window_h}"]
- if use_own_browser:
- chrome_path = os.getenv("CHROME_PATH", None)
- if chrome_path == "":
- chrome_path = None
- chrome_user_data = os.getenv("CHROME_USER_DATA", None)
- if chrome_user_data:
- extra_chromium_args += [f"--user-data-dir={chrome_user_data}"]
- else:
- chrome_path = None
- browser = CustomBrowser(
- config=BrowserConfig(
- headless=False,
- disable_security=disable_security,
- chrome_instance_path=chrome_path,
- extra_chromium_args=extra_chromium_args,
- )
- )
- browser_context = await browser.new_context(
- config=BrowserContextConfig(
- trace_path="./tmp/traces",
- save_recording_path="./tmp/record_videos",
- no_viewport=False,
- browser_window_size=BrowserContextWindowSize(
- width=window_w, height=window_h
- ),
- )
- )
- agent = CustomAgent(
- task="Give me stock price of Nvidia",
- add_infos="", # some hints for llm to complete the task
- llm=llm,
- browser=browser,
- browser_context=browser_context,
- controller=controller,
- system_prompt_class=CustomSystemPrompt,
- agent_prompt_class=CustomAgentMessagePrompt,
- use_vision=use_vision,
- max_actions_per_step=max_actions_per_step,
- generate_gif=True
- )
- history: AgentHistoryList = await agent.run(max_steps=100)
-
- print("Final Result:")
- pprint(history.final_result(), indent=4)
-
- print("\nErrors:")
- pprint(history.errors(), indent=4)
-
- # e.g. xPaths the model clicked on
- print("\nModel Outputs:")
- pprint(history.model_actions(), indent=4)
-
- print("\nThoughts:")
- pprint(history.model_thoughts(), indent=4)
-
-
- except Exception:
- import traceback
-
- traceback.print_exc()
- finally:
-        # Explicitly close the persistent browser context
- if browser_context:
- await browser_context.close()
-
-        # Close the Playwright object
- if playwright:
- await playwright.stop()
- if browser:
- await browser.close()
-
-
-async def test_browser_use_parallel():
- from browser_use.browser.context import BrowserContextWindowSize
- from browser_use.browser.browser import BrowserConfig
- from playwright.async_api import async_playwright
- from browser_use.browser.browser import Browser
- from src.agent.custom_agent import CustomAgent
- from src.agent.custom_prompts import CustomSystemPrompt, CustomAgentMessagePrompt
- from src.browser.custom_browser import CustomBrowser
- from src.browser.custom_context import BrowserContextConfig
- from src.controller.custom_controller import CustomController
-
- window_w, window_h = 1920, 1080
-
- # llm = utils.get_llm_model(
- # provider="openai",
- # model_name="gpt-4o",
- # temperature=0.8,
- # base_url=os.getenv("OPENAI_ENDPOINT", ""),
- # api_key=os.getenv("OPENAI_API_KEY", ""),
- # )
-
- # llm = utils.get_llm_model(
- # provider="azure_openai",
- # model_name="gpt-4o",
- # temperature=0.8,
- # base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
- # api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
- # )
-
- llm = utils.get_llm_model(
- provider="gemini",
- model_name="gemini-2.0-flash-exp",
- temperature=1.0,
- api_key=os.getenv("GOOGLE_API_KEY", "")
- )
-
- # llm = utils.get_llm_model(
- # provider="deepseek",
- # model_name="deepseek-reasoner",
- # temperature=0.8
- # )
-
- # llm = utils.get_llm_model(
- # provider="deepseek",
- # model_name="deepseek-chat",
- # temperature=0.8
- # )
-
- # llm = utils.get_llm_model(
- # provider="ollama", model_name="qwen2.5:7b", temperature=0.5
- # )
-
- # llm = utils.get_llm_model(
- # provider="ollama", model_name="deepseek-r1:14b", temperature=0.5
- # )
-
- controller = CustomController()
- use_own_browser = True
- disable_security = True
- use_vision = True # Set to False when using DeepSeek
-
- max_actions_per_step = 1
- playwright = None
- browser = None
- browser_context = None
-
- browser = Browser(
- config=BrowserConfig(
- disable_security=True,
- headless=False,
- new_context_config=BrowserContextConfig(save_recording_path='./tmp/recordings'),
- )
- )
-
- try:
- agents = [
- Agent(task=task, llm=llm, browser=browser)
- for task in [
- 'Search Google for weather in Tokyo',
- 'Check Reddit front page title',
- 'Find NASA image of the day',
- 'Check top story on CNN',
- # 'Search latest SpaceX launch date',
- # 'Look up population of Paris',
- # 'Find current time in Sydney',
- # 'Check who won last Super Bowl',
- # 'Search trending topics on Twitter',
- ]
- ]
-
- history = await asyncio.gather(*[agent.run() for agent in agents])
- pdb.set_trace()
- print("Final Result:")
- pprint(history.final_result(), indent=4)
-
- print("\nErrors:")
- pprint(history.errors(), indent=4)
-
- # e.g. xPaths the model clicked on
- print("\nModel Outputs:")
- pprint(history.model_actions(), indent=4)
-
- print("\nThoughts:")
- pprint(history.model_thoughts(), indent=4)
- # close browser
- except Exception:
- import traceback
-
- traceback.print_exc()
- finally:
-        # Explicitly close the persistent browser context
- if browser_context:
- await browser_context.close()
-
-        # Close the Playwright object
- if playwright:
- await playwright.stop()
- if browser:
- await browser.close()
-
-
-if __name__ == "__main__":
- # asyncio.run(test_browser_use_org())
- # asyncio.run(test_browser_use_parallel())
- asyncio.run(test_browser_use_custom())
diff --git a/tests/test_controller.py b/tests/test_controller.py
new file mode 100644
index 0000000..173bae4
--- /dev/null
+++ b/tests/test_controller.py
@@ -0,0 +1,131 @@
+import asyncio
+import pdb
+import sys
+import time
+
+sys.path.append(".")
+
+from dotenv import load_dotenv
+
+load_dotenv()
+
+
+async def test_mcp_client():
+ from src.utils.mcp_client import setup_mcp_client_and_tools, create_tool_param_model
+
+ test_server_config = {
+ "mcpServers": {
+ # "markitdown": {
+ # "command": "docker",
+ # "args": [
+ # "run",
+ # "--rm",
+ # "-i",
+ # "markitdown-mcp:latest"
+ # ]
+ # },
+ "desktop-commander": {
+ "command": "npx",
+ "args": [
+ "-y",
+ "@wonderwhy-er/desktop-commander"
+ ]
+ },
+ # "filesystem": {
+ # "command": "npx",
+ # "args": [
+ # "-y",
+ # "@modelcontextprotocol/server-filesystem",
+ # "/Users/xxx/ai_workspace",
+ # ]
+ # },
+ }
+ }
+
+ mcp_tools, mcp_client = await setup_mcp_client_and_tools(test_server_config)
+
+ for tool in mcp_tools:
+ tool_param_model = create_tool_param_model(tool)
+ print(tool.name)
+ print(tool.description)
+ print(tool_param_model.model_json_schema())
+ pdb.set_trace()
+
+
+async def test_controller_with_mcp():
+ import os
+ from src.controller.custom_controller import CustomController
+ from browser_use.controller.registry.views import ActionModel
+
+ mcp_server_config = {
+ "mcpServers": {
+ # "markitdown": {
+ # "command": "docker",
+ # "args": [
+ # "run",
+ # "--rm",
+ # "-i",
+ # "markitdown-mcp:latest"
+ # ]
+ # },
+ "desktop-commander": {
+ "command": "npx",
+ "args": [
+ "-y",
+ "@wonderwhy-er/desktop-commander"
+ ]
+ },
+ # "filesystem": {
+ # "command": "npx",
+ # "args": [
+ # "-y",
+ # "@modelcontextprotocol/server-filesystem",
+ # "/Users/xxx/ai_workspace",
+ # ]
+ # },
+ }
+ }
+
+ controller = CustomController()
+ await controller.setup_mcp_client(mcp_server_config)
+ action_name = "mcp.desktop-commander.execute_command"
+ action_info = controller.registry.registry.actions[action_name]
+ param_model = action_info.param_model
+ print(param_model.model_json_schema())
+ params = {"command": f"python ./tmp/test.py"
+ }
+ validated_params = param_model(**params)
+ ActionModel_ = controller.registry.create_action_model()
+ # Create ActionModel instance with the validated parameters
+ action_model = ActionModel_(**{action_name: validated_params})
+ result = await controller.act(action_model)
+ result = result.extracted_content
+ print(result)
+ if result and "Command is still running. Use read_output to get more output." in result and "PID" in \
+ result.split("\n")[0]:
+ pid = int(result.split("\n")[0].split("PID")[-1].strip())
+ action_name = "mcp.desktop-commander.read_output"
+ action_info = controller.registry.registry.actions[action_name]
+ param_model = action_info.param_model
+ print(param_model.model_json_schema())
+ params = {"pid": pid}
+ validated_params = param_model(**params)
+ action_model = ActionModel_(**{action_name: validated_params})
+ output_result = ""
+ while True:
+ time.sleep(1)
+ result = await controller.act(action_model)
+ result = result.extracted_content
+ if result:
+ pdb.set_trace()
+ output_result = result
+ break
+ print(output_result)
+ pdb.set_trace()
+ await controller.close_mcp_client()
+ pdb.set_trace()
+
+
+if __name__ == '__main__':
+ # asyncio.run(test_mcp_client())
+ asyncio.run(test_controller_with_mcp())
diff --git a/tests/test_deep_research.py b/tests/test_deep_research.py
deleted file mode 100644
index 762345d..0000000
--- a/tests/test_deep_research.py
+++ /dev/null
@@ -1,30 +0,0 @@
-import asyncio
-import os
-from dotenv import load_dotenv
-
-load_dotenv()
-import sys
-
-sys.path.append(".")
-
-async def test_deep_research():
- from src.utils.deep_research import deep_research
- from src.utils import utils
-
- task = "write a report about DeepSeek-R1, get its pdf"
- llm = utils.get_llm_model(
- provider="gemini",
- model_name="gemini-2.0-flash-thinking-exp-01-21",
- temperature=1.0,
- api_key=os.getenv("GOOGLE_API_KEY", "")
- )
-
- report_content, report_file_path = await deep_research(task=task, llm=llm, agent_state=None,
- max_search_iterations=1,
- max_query_num=3,
- use_own_browser=False)
-
-
-
-if __name__ == "__main__":
- asyncio.run(test_deep_research())
\ No newline at end of file
diff --git a/tests/test_llm_api.py b/tests/test_llm_api.py
index 1eb45f4..938f825 100644
--- a/tests/test_llm_api.py
+++ b/tests/test_llm_api.py
@@ -12,6 +12,7 @@ import sys
sys.path.append(".")
+
@dataclass
class LLMConfig:
provider: str
@@ -20,6 +21,7 @@ class LLMConfig:
base_url: str = None
api_key: str = None
+
def create_message_content(text, image_path=None):
content = [{"type": "text", "text": text}]
image_format = "png" if image_path and image_path.endswith(".png") else "jpeg"
@@ -32,6 +34,7 @@ def create_message_content(text, image_path=None):
})
return content
+
def get_env_value(key, provider):
env_mappings = {
"openai": {"api_key": "OPENAI_API_KEY", "base_url": "OPENAI_ENDPOINT"},
@@ -40,20 +43,22 @@ def get_env_value(key, provider):
"deepseek": {"api_key": "DEEPSEEK_API_KEY", "base_url": "DEEPSEEK_ENDPOINT"},
"mistral": {"api_key": "MISTRAL_API_KEY", "base_url": "MISTRAL_ENDPOINT"},
"alibaba": {"api_key": "ALIBABA_API_KEY", "base_url": "ALIBABA_ENDPOINT"},
- "moonshot":{"api_key": "MOONSHOT_API_KEY", "base_url": "MOONSHOT_ENDPOINT"},
+ "moonshot": {"api_key": "MOONSHOT_API_KEY", "base_url": "MOONSHOT_ENDPOINT"},
+ "ibm": {"api_key": "IBM_API_KEY", "base_url": "IBM_ENDPOINT"}
}
if provider in env_mappings and key in env_mappings[provider]:
return os.getenv(env_mappings[provider][key], "")
return ""
+
def test_llm(config, query, image_path=None, system_message=None):
- from src.utils import utils
+ from src.utils import utils, llm_provider
# Special handling for Ollama-based models
if config.provider == "ollama":
if "deepseek-r1" in config.model_name:
- from src.utils.llm import DeepSeekR1ChatOllama
+ from src.utils.llm_provider import DeepSeekR1ChatOllama
llm = DeepSeekR1ChatOllama(model=config.model_name)
else:
llm = ChatOllama(model=config.model_name)
@@ -65,7 +70,7 @@ def test_llm(config, query, image_path=None, system_message=None):
return
# For other providers, use the standard configuration
- llm = utils.get_llm_model(
+ llm = llm_provider.get_llm_model(
provider=config.provider,
model_name=config.model_name,
temperature=config.temperature,
@@ -85,53 +90,70 @@ def test_llm(config, query, image_path=None, system_message=None):
print(ai_msg.reasoning_content)
print(ai_msg.content)
- if config.provider == "deepseek" and "deepseek-reasoner" in config.model_name:
- print(llm.model_name)
- pdb.set_trace()
-
def test_openai_model():
config = LLMConfig(provider="openai", model_name="gpt-4o")
test_llm(config, "Describe this image", "assets/examples/test.png")
+
def test_google_model():
# Enable your API key first if you haven't: https://ai.google.dev/palm_docs/oauth_quickstart
config = LLMConfig(provider="google", model_name="gemini-2.0-flash-exp")
test_llm(config, "Describe this image", "assets/examples/test.png")
+
def test_azure_openai_model():
config = LLMConfig(provider="azure_openai", model_name="gpt-4o")
test_llm(config, "Describe this image", "assets/examples/test.png")
+
def test_deepseek_model():
config = LLMConfig(provider="deepseek", model_name="deepseek-chat")
test_llm(config, "Who are you?")
+
def test_deepseek_r1_model():
config = LLMConfig(provider="deepseek", model_name="deepseek-reasoner")
test_llm(config, "Which is greater, 9.11 or 9.8?", system_message="You are a helpful AI assistant.")
+
def test_ollama_model():
config = LLMConfig(provider="ollama", model_name="qwen2.5:7b")
test_llm(config, "Sing a ballad of LangChain.")
+
def test_deepseek_r1_ollama_model():
config = LLMConfig(provider="ollama", model_name="deepseek-r1:14b")
test_llm(config, "How many 'r's are in the word 'strawberry'?")
+
def test_mistral_model():
config = LLMConfig(provider="mistral", model_name="pixtral-large-latest")
test_llm(config, "Describe this image", "assets/examples/test.png")
+
def test_moonshot_model():
config = LLMConfig(provider="moonshot", model_name="moonshot-v1-32k-vision-preview")
test_llm(config, "Describe this image", "assets/examples/test.png")
+
+def test_ibm_model():
+ config = LLMConfig(provider="ibm", model_name="meta-llama/llama-4-maverick-17b-128e-instruct-fp8")
+ test_llm(config, "Describe this image", "assets/examples/test.png")
+
+
+def test_qwen_model():
+ config = LLMConfig(provider="alibaba", model_name="qwen-vl-max")
+ test_llm(config, "How many 'r's are in the word 'strawberry'?")
+
+
if __name__ == "__main__":
# test_openai_model()
# test_google_model()
- # test_azure_openai_model()
- #test_deepseek_model()
+ test_azure_openai_model()
+ # test_deepseek_model()
# test_ollama_model()
- test_deepseek_r1_model()
+ # test_deepseek_r1_model()
# test_deepseek_r1_ollama_model()
# test_mistral_model()
+ # test_ibm_model()
+ # test_qwen_model()
diff --git a/tests/test_playwright.py b/tests/test_playwright.py
index 6704a02..5a522fd 100644
--- a/tests/test_playwright.py
+++ b/tests/test_playwright.py
@@ -6,7 +6,7 @@ load_dotenv()
def test_connect_browser():
import os
- from playwright.sync_api import sync_playwright
+ from patchright.sync_api import sync_playwright
chrome_exe = os.getenv("CHROME_PATH", "")
chrome_use_data = os.getenv("CHROME_USER_DATA", "")
diff --git a/webui.py b/webui.py
index bc68605..34e93ab 100644
--- a/webui.py
+++ b/webui.py
@@ -1,1199 +1,18 @@
-import pdb
-import logging
-
from dotenv import load_dotenv
-
load_dotenv()
-import os
-import glob
-import asyncio
import argparse
-import os
-
-logger = logging.getLogger(__name__)
-
-import gradio as gr
-import inspect
-from functools import wraps
-
-from browser_use.agent.service import Agent
-from playwright.async_api import async_playwright
-from browser_use.browser.browser import Browser, BrowserConfig
-from browser_use.browser.context import (
- BrowserContextConfig,
- BrowserContextWindowSize,
-)
-from langchain_ollama import ChatOllama
-from playwright.async_api import async_playwright
-from src.utils.agent_state import AgentState
-
-from src.utils import utils
-from src.agent.custom_agent import CustomAgent
-from src.browser.custom_browser import CustomBrowser
-from src.agent.custom_prompts import CustomSystemPrompt, CustomAgentMessagePrompt
-from src.browser.custom_context import BrowserContextConfig, CustomBrowserContext
-from src.controller.custom_controller import CustomController
-from gradio.themes import Citrus, Default, Glass, Monochrome, Ocean, Origin, Soft, Base
-from src.utils.utils import update_model_dropdown, get_latest_files, capture_screenshot, MissingAPIKeyError
-from src.utils import utils
-
-# Global variables for persistence
-_global_browser = None
-_global_browser_context = None
-_global_agent = None
-
-# Create the global agent state instance
-_global_agent_state = AgentState()
-
-# webui config
-webui_config_manager = utils.ConfigManager()
-
-
-def scan_and_register_components(blocks):
-    """Scan a Blocks object and register all interactive components inside it, excluding buttons."""
- global webui_config_manager
-
- def traverse_blocks(block, prefix=""):
- registered = 0
-
-        # Process the components of the Blocks object itself
- if hasattr(block, "children"):
- for i, child in enumerate(block.children):
- if isinstance(child, gr.components.Component):
-                    # Exclude Button components
- if getattr(child, "interactive", False) and not isinstance(child, gr.Button):
- name = f"{prefix}component_{i}"
- if hasattr(child, "label") and child.label:
-                            # Use the label as part of the name
- label = child.label
- name = f"{prefix}{label}"
- logger.debug(f"Registering component: {name}")
- webui_config_manager.register_component(name, child)
- registered += 1
- elif hasattr(child, "children"):
-                    # Recursively process nested Blocks
- new_prefix = f"{prefix}block_{i}_"
- registered += traverse_blocks(child, new_prefix)
-
- return registered
-
- total = traverse_blocks(blocks)
- logger.info(f"Total registered components: {total}")
-
-
-def save_current_config():
- return webui_config_manager.save_current_config()
-
-
-def update_ui_from_config(config_file):
- return webui_config_manager.update_ui_from_config(config_file)
-
-
-def resolve_sensitive_env_variables(text):
- """
- Replace environment variable placeholders ($SENSITIVE_*) with their values.
- Only replaces variables that start with SENSITIVE_.
- """
- if not text:
- return text
-
- import re
-
- # Find all $SENSITIVE_* patterns
- env_vars = re.findall(r'\$SENSITIVE_[A-Za-z0-9_]*', text)
-
- result = text
- for var in env_vars:
- # Remove the $ prefix to get the actual environment variable name
- env_name = var[1:] # removes the $
- env_value = os.getenv(env_name)
- if env_value is not None:
- # Replace $SENSITIVE_VAR_NAME with its value
- result = result.replace(var, env_value)
-
- return result
-
-
-async def stop_agent():
- """Request the agent to stop and update UI with enhanced feedback"""
- global _global_agent
-
- try:
- if _global_agent is not None:
- # Request stop
- _global_agent.stop()
- # Update UI immediately
- message = "Stop requested - the agent will halt at the next safe point"
- logger.info(f"š {message}")
-
- # Return UI updates
- return (
- gr.update(value="Stopping...", interactive=False), # stop_button
- gr.update(interactive=False), # run_button
- )
- except Exception as e:
- error_msg = f"Error during stop: {str(e)}"
- logger.error(error_msg)
- return (
- gr.update(value="Stop", interactive=True),
- gr.update(interactive=True)
- )
-
-
-async def stop_research_agent():
- """Request the agent to stop and update UI with enhanced feedback"""
- global _global_agent_state
-
- try:
- # Request stop
- _global_agent_state.request_stop()
-
- # Update UI immediately
- message = "Stop requested - the agent will halt at the next safe point"
- logger.info(f"š {message}")
-
- # Return UI updates
- return ( # errors_output
- gr.update(value="Stopping...", interactive=False), # stop_button
- gr.update(interactive=False), # run_button
- )
- except Exception as e:
- error_msg = f"Error during stop: {str(e)}"
- logger.error(error_msg)
- return (
- gr.update(value="Stop", interactive=True),
- gr.update(interactive=True)
- )
-
-
-async def run_browser_agent(
- agent_type,
- llm_provider,
- llm_model_name,
- llm_num_ctx,
- llm_temperature,
- llm_base_url,
- llm_api_key,
- use_own_browser,
- keep_browser_open,
- headless,
- disable_security,
- window_w,
- window_h,
- save_recording_path,
- save_agent_history_path,
- save_trace_path,
- enable_recording,
- task,
- add_infos,
- max_steps,
- use_vision,
- max_actions_per_step,
- tool_calling_method,
- chrome_cdp,
- max_input_tokens
-):
- try:
- # Disable recording if the checkbox is unchecked
- if not enable_recording:
- save_recording_path = None
-
- # Ensure the recording directory exists if recording is enabled
- if save_recording_path:
- os.makedirs(save_recording_path, exist_ok=True)
-
- # Get the list of existing videos before the agent runs
- existing_videos = set()
- if save_recording_path:
- existing_videos = set(
- glob.glob(os.path.join(save_recording_path, "*.[mM][pP]4"))
- + glob.glob(os.path.join(save_recording_path, "*.[wW][eE][bB][mM]"))
- )
-
- task = resolve_sensitive_env_variables(task)
-
- # Run the agent
- llm = utils.get_llm_model(
- provider=llm_provider,
- model_name=llm_model_name,
- num_ctx=llm_num_ctx,
- temperature=llm_temperature,
- base_url=llm_base_url,
- api_key=llm_api_key,
- )
- if agent_type == "org":
- final_result, errors, model_actions, model_thoughts, trace_file, history_file = await run_org_agent(
- llm=llm,
- use_own_browser=use_own_browser,
- keep_browser_open=keep_browser_open,
- headless=headless,
- disable_security=disable_security,
- window_w=window_w,
- window_h=window_h,
- save_recording_path=save_recording_path,
- save_agent_history_path=save_agent_history_path,
- save_trace_path=save_trace_path,
- task=task,
- max_steps=max_steps,
- use_vision=use_vision,
- max_actions_per_step=max_actions_per_step,
- tool_calling_method=tool_calling_method,
- chrome_cdp=chrome_cdp,
- max_input_tokens=max_input_tokens
- )
- elif agent_type == "custom":
- final_result, errors, model_actions, model_thoughts, trace_file, history_file = await run_custom_agent(
- llm=llm,
- use_own_browser=use_own_browser,
- keep_browser_open=keep_browser_open,
- headless=headless,
- disable_security=disable_security,
- window_w=window_w,
- window_h=window_h,
- save_recording_path=save_recording_path,
- save_agent_history_path=save_agent_history_path,
- save_trace_path=save_trace_path,
- task=task,
- add_infos=add_infos,
- max_steps=max_steps,
- use_vision=use_vision,
- max_actions_per_step=max_actions_per_step,
- tool_calling_method=tool_calling_method,
- chrome_cdp=chrome_cdp,
- max_input_tokens=max_input_tokens
- )
- else:
- raise ValueError(f"Invalid agent type: {agent_type}")
-
- # Get the list of videos after the agent runs (if recording is enabled)
- # latest_video = None
- # if save_recording_path:
- # new_videos = set(
- # glob.glob(os.path.join(save_recording_path, "*.[mM][pP]4"))
- # + glob.glob(os.path.join(save_recording_path, "*.[wW][eE][bB][mM]"))
- # )
- # if new_videos - existing_videos:
- # latest_video = list(new_videos - existing_videos)[0] # Get the first new video
-
- gif_path = os.path.join(os.path.dirname(__file__), "agent_history.gif")
-
- return (
- final_result,
- errors,
- model_actions,
- model_thoughts,
- gif_path,
- trace_file,
- history_file,
- gr.update(value="Stop", interactive=True), # Re-enable stop button
- gr.update(interactive=True) # Re-enable run button
- )
-
- except MissingAPIKeyError as e:
- logger.error(str(e))
- raise gr.Error(str(e), print_exception=False)
-
- except Exception as e:
- import traceback
- traceback.print_exc()
- errors = str(e) + "\n" + traceback.format_exc()
- return (
- '', # final_result
- errors, # errors
- '', # model_actions
- '', # model_thoughts
- None, # latest_video
- None, # history_file
- None, # trace_file
- gr.update(value="Stop", interactive=True), # Re-enable stop button
- gr.update(interactive=True) # Re-enable run button
- )
-
-
-async def run_org_agent(
- llm,
- use_own_browser,
- keep_browser_open,
- headless,
- disable_security,
- window_w,
- window_h,
- save_recording_path,
- save_agent_history_path,
- save_trace_path,
- task,
- max_steps,
- use_vision,
- max_actions_per_step,
- tool_calling_method,
- chrome_cdp,
- max_input_tokens
-):
- try:
- global _global_browser, _global_browser_context, _global_agent
-
- extra_chromium_args = [f"--window-size={window_w},{window_h}"]
- cdp_url = chrome_cdp
-
- if use_own_browser:
- cdp_url = os.getenv("CHROME_CDP", chrome_cdp)
- chrome_path = os.getenv("CHROME_PATH", None)
- if chrome_path == "":
- chrome_path = None
- chrome_user_data = os.getenv("CHROME_USER_DATA", None)
- if chrome_user_data:
- extra_chromium_args += [f"--user-data-dir={chrome_user_data}"]
- else:
- chrome_path = None
-
- if _global_browser is None:
- _global_browser = Browser(
- config=BrowserConfig(
- headless=headless,
- cdp_url=cdp_url,
- disable_security=disable_security,
- chrome_instance_path=chrome_path,
- extra_chromium_args=extra_chromium_args,
- )
- )
-
- if _global_browser_context is None:
- _global_browser_context = await _global_browser.new_context(
- config=BrowserContextConfig(
- trace_path=save_trace_path if save_trace_path else None,
- save_recording_path=save_recording_path if save_recording_path else None,
- no_viewport=False,
- browser_window_size=BrowserContextWindowSize(
- width=window_w, height=window_h
- ),
- )
- )
-
- if _global_agent is None:
- _global_agent = Agent(
- task=task,
- llm=llm,
- use_vision=use_vision,
- browser=_global_browser,
- browser_context=_global_browser_context,
- max_actions_per_step=max_actions_per_step,
- tool_calling_method=tool_calling_method,
- max_input_tokens=max_input_tokens,
- generate_gif=True
- )
- history = await _global_agent.run(max_steps=max_steps)
-
- history_file = os.path.join(save_agent_history_path, f"{_global_agent.state.agent_id}.json")
- _global_agent.save_history(history_file)
-
- final_result = history.final_result()
- errors = history.errors()
- model_actions = history.model_actions()
- model_thoughts = history.model_thoughts()
-
- trace_file = get_latest_files(save_trace_path)
-
- return final_result, errors, model_actions, model_thoughts, trace_file.get('.zip'), history_file
- except Exception as e:
- import traceback
- traceback.print_exc()
- errors = str(e) + "\n" + traceback.format_exc()
- return '', errors, '', '', None, None
- finally:
- _global_agent = None
- # Handle cleanup based on persistence configuration
- if not keep_browser_open:
- if _global_browser_context:
- await _global_browser_context.close()
- _global_browser_context = None
-
- if _global_browser:
- await _global_browser.close()
- _global_browser = None
-
-
-async def run_custom_agent(
- llm,
- use_own_browser,
- keep_browser_open,
- headless,
- disable_security,
- window_w,
- window_h,
- save_recording_path,
- save_agent_history_path,
- save_trace_path,
- task,
- add_infos,
- max_steps,
- use_vision,
- max_actions_per_step,
- tool_calling_method,
- chrome_cdp,
- max_input_tokens
-):
- try:
- global _global_browser, _global_browser_context, _global_agent
-
- extra_chromium_args = [f"--window-size={window_w},{window_h}"]
- cdp_url = chrome_cdp
- if use_own_browser:
- cdp_url = os.getenv("CHROME_CDP", chrome_cdp)
-
- chrome_path = os.getenv("CHROME_PATH", None)
- if chrome_path == "":
- chrome_path = None
- chrome_user_data = os.getenv("CHROME_USER_DATA", None)
- if chrome_user_data:
- extra_chromium_args += [f"--user-data-dir={chrome_user_data}"]
- else:
- chrome_path = None
-
- controller = CustomController()
-
- # Initialize global browser if needed
- # if chrome_cdp not empty string nor None
- if (_global_browser is None) or (cdp_url and cdp_url != "" and cdp_url != None):
- _global_browser = CustomBrowser(
- config=BrowserConfig(
- headless=headless,
- disable_security=disable_security,
- cdp_url=cdp_url,
- chrome_instance_path=chrome_path,
- extra_chromium_args=extra_chromium_args,
- )
- )
-
- if _global_browser_context is None or (chrome_cdp and cdp_url != "" and cdp_url != None):
- _global_browser_context = await _global_browser.new_context(
- config=BrowserContextConfig(
- trace_path=save_trace_path if save_trace_path else None,
- save_recording_path=save_recording_path if save_recording_path else None,
- no_viewport=False,
- browser_window_size=BrowserContextWindowSize(
- width=window_w, height=window_h
- ),
- )
- )
-
- # Create and run agent
- if _global_agent is None:
- _global_agent = CustomAgent(
- task=task,
- add_infos=add_infos,
- use_vision=use_vision,
- llm=llm,
- browser=_global_browser,
- browser_context=_global_browser_context,
- controller=controller,
- system_prompt_class=CustomSystemPrompt,
- agent_prompt_class=CustomAgentMessagePrompt,
- max_actions_per_step=max_actions_per_step,
- tool_calling_method=tool_calling_method,
- max_input_tokens=max_input_tokens,
- generate_gif=True
- )
- history = await _global_agent.run(max_steps=max_steps)
-
- history_file = os.path.join(save_agent_history_path, f"{_global_agent.state.agent_id}.json")
- _global_agent.save_history(history_file)
-
- final_result = history.final_result()
- errors = history.errors()
- model_actions = history.model_actions()
- model_thoughts = history.model_thoughts()
-
- trace_file = get_latest_files(save_trace_path)
-
- return final_result, errors, model_actions, model_thoughts, trace_file.get('.zip'), history_file
- except Exception as e:
- import traceback
- traceback.print_exc()
- errors = str(e) + "\n" + traceback.format_exc()
- return '', errors, '', '', None, None
- finally:
- _global_agent = None
- # Handle cleanup based on persistence configuration
- if not keep_browser_open:
- if _global_browser_context:
- await _global_browser_context.close()
- _global_browser_context = None
-
- if _global_browser:
- await _global_browser.close()
- _global_browser = None
-
-
-async def run_with_stream(
- agent_type,
- llm_provider,
- llm_model_name,
- llm_num_ctx,
- llm_temperature,
- llm_base_url,
- llm_api_key,
- use_own_browser,
- keep_browser_open,
- headless,
- disable_security,
- window_w,
- window_h,
- save_recording_path,
- save_agent_history_path,
- save_trace_path,
- enable_recording,
- task,
- add_infos,
- max_steps,
- use_vision,
- max_actions_per_step,
- tool_calling_method,
- chrome_cdp,
- max_input_tokens
-):
- global _global_agent
-
- stream_vw = 80
- stream_vh = int(80 * window_h // window_w)
- if not headless:
- result = await run_browser_agent(
- agent_type=agent_type,
- llm_provider=llm_provider,
- llm_model_name=llm_model_name,
- llm_num_ctx=llm_num_ctx,
- llm_temperature=llm_temperature,
- llm_base_url=llm_base_url,
- llm_api_key=llm_api_key,
- use_own_browser=use_own_browser,
- keep_browser_open=keep_browser_open,
- headless=headless,
- disable_security=disable_security,
- window_w=window_w,
- window_h=window_h,
- save_recording_path=save_recording_path,
- save_agent_history_path=save_agent_history_path,
- save_trace_path=save_trace_path,
- enable_recording=enable_recording,
- task=task,
- add_infos=add_infos,
- max_steps=max_steps,
- use_vision=use_vision,
- max_actions_per_step=max_actions_per_step,
- tool_calling_method=tool_calling_method,
- chrome_cdp=chrome_cdp,
- max_input_tokens=max_input_tokens
- )
- # Add HTML content at the start of the result array
- yield [gr.update(visible=False)] + list(result)
- else:
- try:
- # Run the browser agent in the background
- agent_task = asyncio.create_task(
- run_browser_agent(
- agent_type=agent_type,
- llm_provider=llm_provider,
- llm_model_name=llm_model_name,
- llm_num_ctx=llm_num_ctx,
- llm_temperature=llm_temperature,
- llm_base_url=llm_base_url,
- llm_api_key=llm_api_key,
- use_own_browser=use_own_browser,
- keep_browser_open=keep_browser_open,
- headless=headless,
- disable_security=disable_security,
- window_w=window_w,
- window_h=window_h,
- save_recording_path=save_recording_path,
- save_agent_history_path=save_agent_history_path,
- save_trace_path=save_trace_path,
- enable_recording=enable_recording,
- task=task,
- add_infos=add_infos,
- max_steps=max_steps,
- use_vision=use_vision,
- max_actions_per_step=max_actions_per_step,
- tool_calling_method=tool_calling_method,
- chrome_cdp=chrome_cdp,
- max_input_tokens=max_input_tokens
- )
- )
-
- # Initialize values for streaming
- html_content = f"Using browser...
"
- final_result = errors = model_actions = model_thoughts = ""
- recording_gif = trace = history_file = None
-
- # Periodically update the stream while the agent task is running
- while not agent_task.done():
- try:
- encoded_screenshot = await capture_screenshot(_global_browser_context)
- if encoded_screenshot is not None:
- html_content = f'
'
- else:
- html_content = f"Waiting for browser session...
"
- except Exception as e:
- html_content = f"Waiting for browser session...
"
-
- if _global_agent and _global_agent.state.stopped:
- yield [
- gr.HTML(value=html_content, visible=True),
- final_result,
- errors,
- model_actions,
- model_thoughts,
- recording_gif,
- trace,
- history_file,
- gr.update(value="Stopping...", interactive=False), # stop_button
- gr.update(interactive=False), # run_button
- ]
- break
- else:
- yield [
- gr.HTML(value=html_content, visible=True),
- final_result,
- errors,
- model_actions,
- model_thoughts,
- recording_gif,
- trace,
- history_file,
- gr.update(), # Re-enable stop button
- gr.update() # Re-enable run button
- ]
- await asyncio.sleep(0.1)
-
- # Once the agent task completes, get the results
- try:
- result = await agent_task
- final_result, errors, model_actions, model_thoughts, recording_gif, trace, history_file, stop_button, run_button = result
- except gr.Error:
- final_result = ""
- model_actions = ""
- model_thoughts = ""
- recording_gif = trace = history_file = None
-
- except Exception as e:
- errors = f"Agent error: {str(e)}"
-
- yield [
- gr.HTML(value=html_content, visible=True),
- final_result,
- errors,
- model_actions,
- model_thoughts,
- recording_gif,
- trace,
- history_file,
- stop_button,
- run_button
- ]
-
- except Exception as e:
- import traceback
- yield [
- gr.HTML(
- value=f"<h1 style='width:80vw; height:50vh'>Waiting for browser session...</h1>",
- visible=True),
- "",
- f"Error: {str(e)}\n{traceback.format_exc()}",
- "",
- "",
- None,
- None,
- None,
- gr.update(value="Stop", interactive=True), # Re-enable stop button
- gr.update(interactive=True) # Re-enable run button
- ]
-
-
-# Define the theme map globally
-theme_map = {
- "Default": Default(),
- "Soft": Soft(),
- "Monochrome": Monochrome(),
- "Glass": Glass(),
- "Origin": Origin(),
- "Citrus": Citrus(),
- "Ocean": Ocean(),
- "Base": Base()
-}
-
-
-async def close_global_browser():
- global _global_browser, _global_browser_context
-
- if _global_browser_context:
- await _global_browser_context.close()
- _global_browser_context = None
-
- if _global_browser:
- await _global_browser.close()
- _global_browser = None
-
-
-async def run_deep_search(research_task, max_search_iteration_input, max_query_per_iter_input, llm_provider,
- llm_model_name, llm_num_ctx, llm_temperature, llm_base_url, llm_api_key, use_vision,
- use_own_browser, headless, chrome_cdp):
- from src.utils.deep_research import deep_research
- global _global_agent_state
-
- # Clear any previous stop request
- _global_agent_state.clear_stop()
-
- llm = utils.get_llm_model(
- provider=llm_provider,
- model_name=llm_model_name,
- num_ctx=llm_num_ctx,
- temperature=llm_temperature,
- base_url=llm_base_url,
- api_key=llm_api_key,
- )
- markdown_content, file_path = await deep_research(research_task, llm, _global_agent_state,
- max_search_iterations=max_search_iteration_input,
- max_query_num=max_query_per_iter_input,
- use_vision=use_vision,
- headless=headless,
- use_own_browser=use_own_browser,
- chrome_cdp=chrome_cdp
- )
-
- return markdown_content, file_path, gr.update(value="Stop", interactive=True), gr.update(interactive=True)
-
-
-def create_ui(theme_name="Ocean"):
- css = """
- .gradio-container {
- width: 60vw !important;
- max-width: 60% !important;
- margin-left: auto !important;
- margin-right: auto !important;
- padding-top: 20px !important;
- }
- .header-text {
- text-align: center;
- margin-bottom: 30px;
- }
- .theme-section {
- margin-bottom: 20px;
- padding: 15px;
- border-radius: 10px;
- }
- """
-
- with gr.Blocks(
- title="Browser Use WebUI", theme=theme_map[theme_name], css=css
- ) as demo:
- with gr.Row():
- gr.Markdown(
- """
- # 🌐 Browser Use WebUI
- ### Control your browser with AI assistance
- """,
- elem_classes=["header-text"],
- )
-
- with gr.Tabs() as tabs:
- with gr.TabItem("⚙️ Agent Settings", id=1):
- with gr.Group():
- agent_type = gr.Radio(
- ["org", "custom"],
- label="Agent Type",
- value="custom",
- info="Select the type of agent to use",
- interactive=True
- )
- with gr.Column():
- max_steps = gr.Slider(
- minimum=1,
- maximum=200,
- value=100,
- step=1,
- label="Max Run Steps",
- info="Maximum number of steps the agent will take",
- interactive=True
- )
- max_actions_per_step = gr.Slider(
- minimum=1,
- maximum=100,
- value=10,
- step=1,
- label="Max Actions per Step",
- info="Maximum number of actions the agent will take per step",
- interactive=True
- )
- with gr.Column():
- use_vision = gr.Checkbox(
- label="Use Vision",
- value=True,
- info="Enable visual processing capabilities",
- interactive=True
- )
- max_input_tokens = gr.Number(
- label="Max Input Tokens",
- value=128000,
- precision=0,
- interactive=True
- )
- tool_calling_method = gr.Dropdown(
- label="Tool Calling Method",
- value="auto",
- interactive=True,
- allow_custom_value=True, # Allow users to input custom model names
- choices=["auto", "json_schema", "function_calling"],
- info="Tool Calls Funtion Name",
- visible=False
- )
-
- with gr.TabItem("🔧 LLM Settings", id=2):
- with gr.Group():
- llm_provider = gr.Dropdown(
- choices=[provider for provider, model in utils.model_names.items()],
- label="LLM Provider",
- value="openai",
- info="Select your preferred language model provider",
- interactive=True
- )
- llm_model_name = gr.Dropdown(
- label="Model Name",
- choices=utils.model_names['openai'],
- value="gpt-4o",
- interactive=True,
- allow_custom_value=True, # Allow users to input custom model names
- info="Select a model in the dropdown options or directly type a custom model name"
- )
- ollama_num_ctx = gr.Slider(
- minimum=2 ** 8,
- maximum=2 ** 16,
- value=16000,
- step=1,
- label="Ollama Context Length",
- info="Controls max context length model needs to handle (less = faster)",
- visible=False,
- interactive=True
- )
- llm_temperature = gr.Slider(
- minimum=0.0,
- maximum=2.0,
- value=0.6,
- step=0.1,
- label="Temperature",
- info="Controls randomness in model outputs",
- interactive=True
- )
- with gr.Row():
- llm_base_url = gr.Textbox(
- label="Base URL",
- value="",
- info="API endpoint URL (if required)"
- )
- llm_api_key = gr.Textbox(
- label="API Key",
- type="password",
- value="",
- info="Your API key (leave blank to use .env)"
- )
-
- # Change event to update context length slider
- def update_llm_num_ctx_visibility(llm_provider):
- return gr.update(visible=llm_provider == "ollama")
-
- # Bind the change event of llm_provider to update the visibility of context length slider
- llm_provider.change(
- fn=update_llm_num_ctx_visibility,
- inputs=llm_provider,
- outputs=ollama_num_ctx
- )
-
- with gr.TabItem("🌐 Browser Settings", id=3):
- with gr.Group():
- with gr.Row():
- use_own_browser = gr.Checkbox(
- label="Use Own Browser",
- value=False,
- info="Use your existing browser instance",
- interactive=True
- )
- keep_browser_open = gr.Checkbox(
- label="Keep Browser Open",
- value=False,
- info="Keep Browser Open between Tasks",
- interactive=True
- )
- headless = gr.Checkbox(
- label="Headless Mode",
- value=False,
- info="Run browser without GUI",
- interactive=True
- )
- disable_security = gr.Checkbox(
- label="Disable Security",
- value=True,
- info="Disable browser security features",
- interactive=True
- )
- enable_recording = gr.Checkbox(
- label="Enable Recording",
- value=True,
- info="Enable saving browser recordings",
- interactive=True
- )
-
- with gr.Row():
- window_w = gr.Number(
- label="Window Width",
- value=1280,
- info="Browser window width",
- interactive=True
- )
- window_h = gr.Number(
- label="Window Height",
- value=1100,
- info="Browser window height",
- interactive=True
- )
-
- chrome_cdp = gr.Textbox(
- label="CDP URL",
- placeholder="http://localhost:9222",
- value="",
- info="CDP for google remote debugging",
- interactive=True, # Allow editing only if recording is enabled
- )
-
- save_recording_path = gr.Textbox(
- label="Recording Path",
- placeholder="e.g. ./tmp/record_videos",
- value="./tmp/record_videos",
- info="Path to save browser recordings",
- interactive=True, # Allow editing only if recording is enabled
- )
-
- save_trace_path = gr.Textbox(
- label="Trace Path",
- placeholder="e.g. ./tmp/traces",
- value="./tmp/traces",
- info="Path to save Agent traces",
- interactive=True,
- )
-
- save_agent_history_path = gr.Textbox(
- label="Agent History Save Path",
- placeholder="e.g., ./tmp/agent_history",
- value="./tmp/agent_history",
- info="Specify the directory where agent history should be saved.",
- interactive=True,
- )
-
- with gr.TabItem("🤖 Run Agent", id=4):
- task = gr.Textbox(
- label="Task Description",
- lines=4,
- placeholder="Enter your task here...",
- value="go to google.com and type 'OpenAI' click search and give me the first url",
- info="Describe what you want the agent to do",
- interactive=True
- )
- add_infos = gr.Textbox(
- label="Additional Information",
- lines=3,
- placeholder="Add any helpful context or instructions...",
- info="Optional hints to help the LLM complete the task",
- value="",
- interactive=True
- )
-
- with gr.Row():
- run_button = gr.Button("▶️ Run Agent", variant="primary", scale=2)
- stop_button = gr.Button("⏹️ Stop", variant="stop", scale=1)
-
- with gr.Row():
- browser_view = gr.HTML(
- value="<h1 style='width:80vw; height:50vh'>Waiting for browser session...</h1>",
- label="Live Browser View",
- visible=False
- )
-
- gr.Markdown("### Results")
- with gr.Row():
- with gr.Column():
- final_result_output = gr.Textbox(
- label="Final Result", lines=3, show_label=True
- )
- with gr.Column():
- errors_output = gr.Textbox(
- label="Errors", lines=3, show_label=True
- )
- with gr.Row():
- with gr.Column():
- model_actions_output = gr.Textbox(
- label="Model Actions", lines=3, show_label=True, visible=False
- )
- with gr.Column():
- model_thoughts_output = gr.Textbox(
- label="Model Thoughts", lines=3, show_label=True, visible=False
- )
- recording_gif = gr.Image(label="Result GIF", format="gif")
- trace_file = gr.File(label="Trace File")
- agent_history_file = gr.File(label="Agent History")
-
- with gr.TabItem("🧐 Deep Research", id=5):
- research_task_input = gr.Textbox(label="Research Task", lines=5,
- value="Compose a report on the use of Reinforcement Learning for training Large Language Models, encompassing its origins, current advancements, and future prospects, substantiated with examples of relevant models and techniques. The report should reflect original insights and analysis, moving beyond mere summarization of existing literature.",
- interactive=True)
- with gr.Row():
- max_search_iteration_input = gr.Number(label="Max Search Iteration", value=3,
- precision=0,
- interactive=True)  # precision=0 确保是整数
- max_query_per_iter_input = gr.Number(label="Max Query per Iteration", value=1,
- precision=0,
- interactive=True)  # precision=0 确保是整数
- with gr.Row():
- research_button = gr.Button("▶️ Run Deep Research", variant="primary", scale=2)
- stop_research_button = gr.Button("⏹ Stop", variant="stop", scale=1)
- markdown_output_display = gr.Markdown(label="Research Report")
- markdown_download = gr.File(label="Download Research Report")
-
- # Bind the stop button click event after errors_output is defined
- stop_button.click(
- fn=stop_agent,
- inputs=[],
- outputs=[stop_button, run_button],
- )
-
- # Run button click handler
- run_button.click(
- fn=run_with_stream,
- inputs=[
- agent_type, llm_provider, llm_model_name, ollama_num_ctx, llm_temperature, llm_base_url,
- llm_api_key,
- use_own_browser, keep_browser_open, headless, disable_security, window_w, window_h,
- save_recording_path, save_agent_history_path, save_trace_path, # Include the new path
- enable_recording, task, add_infos, max_steps, use_vision, max_actions_per_step,
- tool_calling_method, chrome_cdp, max_input_tokens
- ],
- outputs=[
- browser_view, # Browser view
- final_result_output, # Final result
- errors_output, # Errors
- model_actions_output, # Model actions
- model_thoughts_output, # Model thoughts
- recording_gif, # Latest recording
- trace_file, # Trace file
- agent_history_file, # Agent history file
- stop_button, # Stop button
- run_button # Run button
- ],
- )
-
- # Run Deep Research
- research_button.click(
- fn=run_deep_search,
- inputs=[research_task_input, max_search_iteration_input, max_query_per_iter_input, llm_provider,
- llm_model_name, ollama_num_ctx, llm_temperature, llm_base_url, llm_api_key, use_vision,
- use_own_browser, headless, chrome_cdp],
- outputs=[markdown_output_display, markdown_download, stop_research_button, research_button]
- )
- # Bind the stop button click event after errors_output is defined
- stop_research_button.click(
- fn=stop_research_agent,
- inputs=[],
- outputs=[stop_research_button, research_button],
- )
-
- with gr.TabItem("🎥 Recordings", id=7, visible=True):
- def list_recordings(save_recording_path):
- if not os.path.exists(save_recording_path):
- return []
-
- # Get all video files
- recordings = glob.glob(os.path.join(save_recording_path, "*.[mM][pP]4")) + glob.glob(
- os.path.join(save_recording_path, "*.[wW][eE][bB][mM]"))
-
- # Sort recordings by creation time (oldest first)
- recordings.sort(key=os.path.getctime)
-
- # Add numbering to the recordings
- numbered_recordings = []
- for idx, recording in enumerate(recordings, start=1):
- filename = os.path.basename(recording)
- numbered_recordings.append((recording, f"{idx}. {filename}"))
-
- return numbered_recordings
-
- recordings_gallery = gr.Gallery(
- label="Recordings",
- columns=3,
- height="auto",
- object_fit="contain"
- )
-
- refresh_button = gr.Button("🔄 Refresh Recordings", variant="secondary")
- refresh_button.click(
- fn=list_recordings,
- inputs=save_recording_path,
- outputs=recordings_gallery
- )
-
- with gr.TabItem("📁 UI Configuration", id=8):
- config_file_input = gr.File(
- label="Load UI Settings from Config File",
- file_types=[".json"],
- interactive=True
- )
- with gr.Row():
- load_config_button = gr.Button("Load Config", variant="primary")
- save_config_button = gr.Button("Save UI Settings", variant="primary")
-
- config_status = gr.Textbox(
- label="Status",
- lines=2,
- interactive=False
- )
- save_config_button.click(
- fn=save_current_config,
- inputs=[],  # 不需要输入参数
- outputs=[config_status]
- )
-
- # Attach the callback to the LLM provider dropdown
- llm_provider.change(
- lambda provider, api_key, base_url: update_model_dropdown(provider, api_key, base_url),
- inputs=[llm_provider, llm_api_key, llm_base_url],
- outputs=llm_model_name
- )
-
- # Add this after defining the components
- enable_recording.change(
- lambda enabled: gr.update(interactive=enabled),
- inputs=enable_recording,
- outputs=save_recording_path
- )
-
- use_own_browser.change(fn=close_global_browser)
- keep_browser_open.change(fn=close_global_browser)
-
- scan_and_register_components(demo)
- global webui_config_manager
- all_components = webui_config_manager.get_all_components()
-
- load_config_button.click(
- fn=update_ui_from_config,
- inputs=[config_file_input],
- outputs=all_components + [config_status]
- )
- return demo
+from src.webui.interface import theme_map, create_ui
def main():
- parser = argparse.ArgumentParser(description="Gradio UI for Browser Agent")
+ parser = argparse.ArgumentParser(description="Gradio WebUI for Browser Agent")
parser.add_argument("--ip", type=str, default="127.0.0.1", help="IP address to bind to")
parser.add_argument("--port", type=int, default=7788, help="Port to listen on")
parser.add_argument("--theme", type=str, default="Ocean", choices=theme_map.keys(), help="Theme to use for the UI")
args = parser.parse_args()
demo = create_ui(theme_name=args.theme)
- demo.launch(server_name=args.ip, server_port=args.port)
+ demo.queue().launch(server_name=args.ip, server_port=args.port)
if __name__ == '__main__':