merge dockerfile

This commit is contained in:
alexwarm
2025-05-09 20:34:30 +08:00
parent 81c0f47775
commit a04773266c
13 changed files with 625 additions and 637 deletions

View File

@@ -40,14 +40,14 @@ ANONYMIZED_TELEMETRY=false
# LogLevel: Set to debug to enable verbose logging, set to result to get results only. Available: result | debug | info
BROWSER_USE_LOGGING_LEVEL=info
# Chrome settings
CHROME_PATH=
CHROME_USER_DATA=
CHROME_DEBUGGING_PORT=9222
CHROME_DEBUGGING_HOST=localhost
# Browser settings
BROWSER_PATH=
BROWSER_USER_DATA=
BROWSER_DEBUGGING_PORT=9222
BROWSER_DEBUGGING_HOST=localhost
# Set to true to keep browser open between AI tasks
CHROME_PERSISTENT_SESSION=false
CHROME_CDP=
KEEP_BROWSER_OPEN=true
BROWSER_CDP=
# Display settings
# Format: WIDTHxHEIGHTxDEPTH
RESOLUTION=1920x1080x24

View File

@@ -1,5 +1,8 @@
FROM python:3.11-slim
# Set platform for multi-arch builds (Docker Buildx will set this)
ARG TARGETPLATFORM
# Install system dependencies
RUN apt-get update && apt-get install -y \
wget \
@@ -28,7 +31,6 @@ RUN apt-get update && apt-get install -y \
fonts-liberation \
dbus \
xauth \
xvfb \
x11vnc \
tigervnc-tools \
supervisor \
@@ -47,33 +49,45 @@ RUN git clone https://github.com/novnc/noVNC.git /opt/novnc \
&& git clone https://github.com/novnc/websockify /opt/novnc/utils/websockify \
&& ln -s /opt/novnc/vnc.html /opt/novnc/index.html
# Set platform for ARM64 compatibility
ARG TARGETPLATFORM=linux/amd64
# Set up working directory
WORKDIR /app
# Copy requirements and install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Ensure 'patchright' is in your requirements.txt or install it directly
# RUN pip install --no-cache-dir -r requirements.txt patchright # If not in requirements
RUN pip install --no-cache-dir -r requirements.txt # Assuming patchright is in requirements.txt
RUN pip install --no-cache-dir patchright # Or install it explicitly
# Install Playwright and browsers with system dependencies
ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
RUN playwright install --with-deps chromium
RUN playwright install-deps
# Install Patchright browsers and dependencies
# Patchright documentation suggests PLAYWRIGHT_BROWSERS_PATH is still relevant
# or that Patchright installs to a similar default location that Playwright would.
# Let's assume Patchright respects PLAYWRIGHT_BROWSERS_PATH or its default install location is findable.
ENV PLAYWRIGHT_BROWSERS_PATH=/ms-browsers
RUN mkdir -p $PLAYWRIGHT_BROWSERS_PATH
# Install recommended: Google Chrome (instead of just Chromium for better undetectability)
# The 'patchright install chrome' command might download and place it.
# The '--with-deps' equivalent for patchright install is to run 'patchright install-deps chrome' after.
RUN patchright install chrome
RUN patchright install-deps chrome
# Alternative: Install Chromium if Google Chrome is problematic in certain environments
RUN patchright install chromium
RUN patchright install-deps chromium
# Copy the application code
COPY . .
# Set environment variables
# Set environment variables (Updated Names)
ENV PYTHONUNBUFFERED=1
ENV BROWSER_USE_LOGGING_LEVEL=info
ENV CHROME_PATH=/ms-playwright/chromium-*/chrome-linux/chrome
# BROWSER_PATH will be determined by Patchright installation, supervisord will find it.
ENV ANONYMIZED_TELEMETRY=false
ENV DISPLAY=:99
ENV RESOLUTION=1920x1080x24
ENV VNC_PASSWORD=vncpassword
ENV CHROME_PERSISTENT_SESSION=true
ENV VNC_PASSWORD=youvncpassword
ENV KEEP_BROWSER_OPEN=true
ENV RESOLUTION_WIDTH=1920
ENV RESOLUTION_HEIGHT=1080
@@ -81,6 +95,6 @@ ENV RESOLUTION_HEIGHT=1080
RUN mkdir -p /var/log/supervisor
COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf
EXPOSE 7788 6080 5901
EXPOSE 7788 6080 5901 9222
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]

View File

@@ -1,85 +0,0 @@
FROM python:3.11-slim
# Install system dependencies
RUN apt-get update && apt-get install -y \
wget \
gnupg \
curl \
unzip \
xvfb \
libgconf-2-4 \
libxss1 \
libnss3 \
libnspr4 \
libasound2 \
libatk1.0-0 \
libatk-bridge2.0-0 \
libcups2 \
libdbus-1-3 \
libdrm2 \
libgbm1 \
libgtk-3-0 \
libxcomposite1 \
libxdamage1 \
libxfixes3 \
libxrandr2 \
xdg-utils \
fonts-liberation \
dbus \
xauth \
xvfb \
x11vnc \
tigervnc-tools \
supervisor \
net-tools \
procps \
git \
python3-numpy \
fontconfig \
fonts-dejavu \
fonts-dejavu-core \
fonts-dejavu-extra \
&& rm -rf /var/lib/apt/lists/*
# Install noVNC
RUN git clone https://github.com/novnc/noVNC.git /opt/novnc \
&& git clone https://github.com/novnc/websockify /opt/novnc/utils/websockify \
&& ln -s /opt/novnc/vnc.html /opt/novnc/index.html
# Set platform explicitly for ARM64
ARG TARGETPLATFORM=linux/arm64
# Set up working directory
WORKDIR /app
# Copy requirements and install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Install Playwright and browsers with system dependencies optimized for ARM64
ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
RUN PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1 pip install playwright && \
playwright install --with-deps chromium
# Copy the application code
COPY . .
# Set environment variables
ENV PYTHONUNBUFFERED=1
ENV BROWSER_USE_LOGGING_LEVEL=info
ENV CHROME_PATH=/ms-playwright/chromium-*/chrome-linux/chrome
ENV ANONYMIZED_TELEMETRY=false
ENV DISPLAY=:99
ENV RESOLUTION=1920x1080x24
ENV VNC_PASSWORD=vncpassword
ENV CHROME_PERSISTENT_SESSION=true
ENV RESOLUTION_WIDTH=1920
ENV RESOLUTION_HEIGHT=1080
# Set up supervisor configuration
RUN mkdir -p /var/log/supervisor
COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf
EXPOSE 7788 6080 5901
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]

132
README.md
View File

@@ -23,10 +23,6 @@ We would like to officially thank [WarmShao](https://github.com/warmshao) for hi
## Installation Guide
### Prerequisites
- Python 3.11 or higher
- Git (for cloning the repository)
### Option 1: Local Installation
Read the [quickstart guide](https://docs.browser-use.com/quickstart#prepare-the-environment) or follow the steps below to get started.
@@ -65,10 +61,13 @@ Install Python packages:
uv pip install -r requirements.txt
```
Install Browsers in Playwright:
You can install specific browsers by running:
Install Browsers in Patchright:
```bash
patchright install chromium
patchright install
```
Or you can install specific browsers by running:
```bash
patchright install chromium --with-deps --no-shell
```
#### Step 4: Configure Environment
@@ -83,6 +82,42 @@ cp .env.example .env
```
2. Open `.env` in your preferred text editor and add your API keys and other settings
#### Local Setup
1. **Run the WebUI:**
After completing the installation steps above, start the application:
```bash
python webui.py --ip 127.0.0.1 --port 7788
```
2. WebUI options:
- `--ip`: The IP address to bind the WebUI to. Default is `127.0.0.1`.
- `--port`: The port to bind the WebUI to. Default is `7788`.
- `--theme`: The theme for the user interface. Default is `Ocean`.
- **Default**: The standard theme with a balanced design.
- **Soft**: A gentle, muted color scheme for a relaxed viewing experience.
- **Monochrome**: A grayscale theme with minimal color for simplicity and focus.
- **Glass**: A sleek, semi-transparent design for a modern appearance.
- **Origin**: A classic, retro-inspired theme for a nostalgic feel.
- **Citrus**: A vibrant, citrus-inspired palette with bright and fresh colors.
- **Ocean** (default): A blue, ocean-inspired theme providing a calming effect.
- `--dark-mode`: Enables dark mode for the user interface.
3. **Access the WebUI:** Open your web browser and navigate to `http://127.0.0.1:7788`.
4. **Using Your Own Browser (Optional):**
- Set `CHROME_PATH` to the executable path of your browser and `CHROME_USER_DATA` to the user data directory of your browser. Leave `CHROME_USER_DATA` empty if you want to use local user data.
- Windows
```env
CHROME_PATH="C:\Program Files\Google\Chrome\Application\chrome.exe"
CHROME_USER_DATA="C:\Users\YourUsername\AppData\Local\Google\Chrome\User Data"
```
> Note: Replace `YourUsername` with your actual Windows username for Windows systems.
- Mac
```env
CHROME_PATH="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
CHROME_USER_DATA="/Users/YourUsername/Library/Application Support/Google/Chrome"
```
- Close all Chrome windows
- Open the WebUI in a non-Chrome browser, such as Firefox or Edge. This is important because the persistent browser context will use the Chrome data when running the agent.
- Check the "Use Own Browser" option within the Browser Settings.
### Option 2: Docker Installation
#### Prerequisites
@@ -118,95 +153,12 @@ docker compose up --build
CHROME_PERSISTENT_SESSION=true docker compose up --build
```
4. Access the Application:
- Web Interface: Open `http://localhost:7788` in your browser
- VNC Viewer (for watching browser interactions): Open `http://localhost:6080/vnc.html`
- Default VNC password: "youvncpassword"
- Can be changed by setting `VNC_PASSWORD` in your `.env` file
## Usage
### Local Setup
1. **Run the WebUI:**
After completing the installation steps above, start the application:
```bash
python webui.py --ip 127.0.0.1 --port 7788
```
2. WebUI options:
- `--ip`: The IP address to bind the WebUI to. Default is `127.0.0.1`.
- `--port`: The port to bind the WebUI to. Default is `7788`.
- `--theme`: The theme for the user interface. Default is `Ocean`.
- **Default**: The standard theme with a balanced design.
- **Soft**: A gentle, muted color scheme for a relaxed viewing experience.
- **Monochrome**: A grayscale theme with minimal color for simplicity and focus.
- **Glass**: A sleek, semi-transparent design for a modern appearance.
- **Origin**: A classic, retro-inspired theme for a nostalgic feel.
- **Citrus**: A vibrant, citrus-inspired palette with bright and fresh colors.
- **Ocean** (default): A blue, ocean-inspired theme providing a calming effect.
- `--dark-mode`: Enables dark mode for the user interface.
3. **Access the WebUI:** Open your web browser and navigate to `http://127.0.0.1:7788`.
4. **Using Your Own Browser (Optional):**
- Set `CHROME_PATH` to the executable path of your browser and `CHROME_USER_DATA` to the user data directory of your browser. Leave `CHROME_USER_DATA` empty if you want to use local user data.
- Windows
```env
CHROME_PATH="C:\Program Files\Google\Chrome\Application\chrome.exe"
CHROME_USER_DATA="C:\Users\YourUsername\AppData\Local\Google\Chrome\User Data"
```
> Note: Replace `YourUsername` with your actual Windows username for Windows systems.
- Mac
```env
CHROME_PATH="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
CHROME_USER_DATA="/Users/YourUsername/Library/Application Support/Google/Chrome"
```
- Close all Chrome windows
- Open the WebUI in a non-Chrome browser, such as Firefox or Edge. This is important because the persistent browser context will use the Chrome data when running the agent.
- Check the "Use Own Browser" option within the Browser Settings.
5. **Keep Browser Open(Optional):**
- Set `CHROME_PERSISTENT_SESSION=true` in the `.env` file.
### Docker Setup
1. **Environment Variables:**
- All configuration is done through the `.env` file
- Available environment variables:
```
# LLM API Keys
OPENAI_API_KEY=your_key_here
ANTHROPIC_API_KEY=your_key_here
GOOGLE_API_KEY=your_key_here
# Browser Settings
CHROME_PERSISTENT_SESSION=true # Set to true to keep browser open between AI tasks
RESOLUTION=1920x1080x24 # Custom resolution format: WIDTHxHEIGHTxDEPTH
RESOLUTION_WIDTH=1920 # Custom width in pixels
RESOLUTION_HEIGHT=1080 # Custom height in pixels
# VNC Settings
VNC_PASSWORD=your_vnc_password # Optional, defaults to "vncpassword"
```
2. **Platform Support:**
- Supports both AMD64 and ARM64 architectures
- For ARM64 systems (e.g., Apple Silicon Macs), the container will automatically use the appropriate image
3. **Browser Persistence Modes:**
- **Default Mode (CHROME_PERSISTENT_SESSION=false):**
- Browser opens and closes with each AI task
- Clean state for each interaction
- Lower resource usage
- **Persistent Mode (CHROME_PERSISTENT_SESSION=true):**
- Browser stays open between AI tasks
- Maintains history and state
- Allows viewing previous AI interactions
- Set in `.env` file or via environment variable when starting container
4. **Viewing Browser Interactions:**
- Access the noVNC viewer at `http://localhost:6080/vnc.html`
- Enter the VNC password (default: "vncpassword" or what you set in VNC_PASSWORD)
- Direct VNC access available on port 5900 (mapped to container port 5901)
- You can now see all browser interactions in real-time
5. **Container Management:**
```bash
# Start with persistent browser

View File

@@ -1,62 +1,76 @@
services:
browser-use-webui:
platform: linux/amd64
build:
context: .
dockerfile: ${DOCKERFILE:-Dockerfile}
dockerfile: Dockerfile
args:
TARGETPLATFORM: ${TARGETPLATFORM:-linux/amd64}
ports:
- "7788:7788" # Gradio default port
- "6080:6080" # noVNC web interface
- "5901:5901" # VNC port
- "9222:9222" # Chrome remote debugging port
- "7788:7788"
- "6080:6080"
- "5901:5901"
- "9222:9222"
environment:
# LLM API Keys & Endpoints (Your existing list)
- OPENAI_ENDPOINT=${OPENAI_ENDPOINT:-https://api.openai.com/v1}
- OPENAI_API_KEY=${OPENAI_API_KEY:-}
- ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
- ANTHROPIC_ENDPOINT=${ANTHROPIC_ENDPOINT:-https://api.anthropic.com}
- ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
- GOOGLE_API_KEY=${GOOGLE_API_KEY:-}
- AZURE_OPENAI_ENDPOINT=${AZURE_OPENAI_ENDPOINT:-}
- AZURE_OPENAI_API_KEY=${AZURE_OPENAI_API_KEY:-}
- AZURE_OPENAI_API_VERSION=${AZURE_OPENAI_API_VERSION:-2025-01-01-preview}
- DEEPSEEK_ENDPOINT=${DEEPSEEK_ENDPOINT:-https://api.deepseek.com}
- DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY:-}
- OLLAMA_ENDPOINT=${OLLAMA_ENDPOINT:-http://localhost:11434}
- MISTRAL_API_KEY=${MISTRAL_API_KEY:-}
- MISTRAL_ENDPOINT=${MISTRAL_ENDPOINT:-https://api.mistral.ai/v1}
- MISTRAL_API_KEY=${MISTRAL_API_KEY:-}
- ALIBABA_ENDPOINT=${ALIBABA_ENDPOINT:-https://dashscope.aliyuncs.com/compatible-mode/v1}
- ALIBABA_API_KEY=${ALIBABA_API_KEY:-}
- MOONSHOT_ENDPOINT=${MOONSHOT_ENDPOINT:-https://api.moonshot.cn/v1}
- MOONSHOT_API_KEY=${MOONSHOT_API_KEY:-}
- IBM_API_KEY=${IBM_API_KEY:-}
- UNBOUND_ENDPOINT=${UNBOUND_ENDPOINT:-https://api.getunbound.ai}
- UNBOUND_API_KEY=${UNBOUND_API_KEY:-}
- SiliconFLOW_ENDPOINT=${SiliconFLOW_ENDPOINT:-https://api.siliconflow.cn/v1/}
- SiliconFLOW_API_KEY=${SiliconFLOW_API_KEY:-}
- IBM_ENDPOINT=${IBM_ENDPOINT:-https://us-south.ml.cloud.ibm.com}
- IBM_API_KEY=${IBM_API_KEY:-}
- IBM_PROJECT_ID=${IBM_PROJECT_ID:-}
- BROWSER_USE_LOGGING_LEVEL=${BROWSER_USE_LOGGING_LEVEL:-info}
# Application Settings
- ANONYMIZED_TELEMETRY=${ANONYMIZED_TELEMETRY:-false}
- CHROME_PATH=/usr/bin/google-chrome
- CHROME_USER_DATA=/app/data/chrome_data
- CHROME_PERSISTENT_SESSION=${CHROME_PERSISTENT_SESSION:-false}
- CHROME_CDP=${CHROME_CDP:-http://localhost:9222}
- BROWSER_USE_LOGGING_LEVEL=${BROWSER_USE_LOGGING_LEVEL:-info}
# Browser Settings
- BROWSER_USER_DATA=${BROWSER_USER_DATA:-/app/data/chrome_data}
- BROWSER_DEBUGGING_PORT=${BROWSER_DEBUGGING_PORT:-9222}
- BROWSER_DEBUGGING_HOST=${BROWSER_DEBUGGING_HOST:-0.0.0.0}
- KEEP_BROWSER_OPEN=${KEEP_BROWSER_OPEN:-true}
- BROWSER_CDP=${BROWSER_CDP:-} # e.g., http://localhost:9222
# Display Settings
- DISPLAY=:99
- PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
# This ENV is used by the Dockerfile during build time if Patchright respects it.
# It's not strictly needed at runtime by docker-compose unless your app or scripts also read it.
- PLAYWRIGHT_BROWSERS_PATH=/ms-browsers # Matches Dockerfile ENV
- RESOLUTION=${RESOLUTION:-1920x1080x24}
- RESOLUTION_WIDTH=${RESOLUTION_WIDTH:-1920}
- RESOLUTION_HEIGHT=${RESOLUTION_HEIGHT:-1080}
- VNC_PASSWORD=${VNC_PASSWORD:-vncpassword}
- CHROME_DEBUGGING_PORT=9222
- CHROME_DEBUGGING_HOST=localhost
# VNC Settings
- VNC_PASSWORD=${VNC_PASSWORD:-youvncpassword}
volumes:
- /tmp/.X11-unix:/tmp/.X11-unix
# - ./my_chrome_data:/app/data/chrome_data # Optional: persist browser data
restart: unless-stopped
shm_size: '2gb'
cap_add:
- SYS_ADMIN
security_opt:
- seccomp=unconfined
tmpfs:
- /tmp
healthcheck:
test: ["CMD", "nc", "-z", "localhost", "5901"]
test: ["CMD", "nc", "-z", "localhost", "5901"] # VNC port
interval: 10s
timeout: 5s
retries: 3
retries: 3

View File

@@ -1,4 +0,0 @@
#!/bin/bash
# Start supervisord in the foreground to properly manage child processes
exec /usr/bin/supervisord -n -c /etc/supervisor/conf.d/supervisord.conf

View File

@@ -1,4 +1,4 @@
browser-use==0.1.43
browser-use==0.1.45
pyperclip==1.9.0
gradio==5.27.0
json-repair

View File

@@ -20,6 +20,7 @@ from browser_use.telemetry.views import (
)
from browser_use.utils import time_execution_async
from dotenv import load_dotenv
from browser_use.agent.message_manager.utils import is_model_without_tool_support
load_dotenv()
logger = logging.getLogger(__name__)
@@ -30,6 +31,22 @@ SKIP_LLM_API_KEY_VERIFICATION = (
class BrowserUseAgent(Agent):
def _set_tool_calling_method(self) -> ToolCallingMethod | None:
tool_calling_method = self.settings.tool_calling_method
if tool_calling_method == 'auto':
if is_model_without_tool_support(self.model_name):
return 'raw'
elif self.chat_model_library == 'ChatGoogleGenerativeAI':
return None
elif self.chat_model_library == 'ChatOpenAI':
return 'function_calling'
elif self.chat_model_library == 'AzureChatOpenAI':
return 'function_calling'
else:
return None
else:
return tool_calling_method
@time_execution_async("--run (agent)")
async def run(
self, max_steps: int = 100, on_step_start: AgentHookFunc | None = None,

View File

@@ -29,7 +29,7 @@ from langchain_core.tools import StructuredTool, Tool
from langgraph.graph import StateGraph
from pydantic import BaseModel, Field
from browser_use.browser.context import BrowserContextWindowSize, BrowserContextConfig
from browser_use.browser.context import BrowserContextConfig
from src.agent.browser_use.browser_use_agent import BrowserUseAgent
from src.browser.custom_browser import CustomBrowser
@@ -82,22 +82,19 @@ async def run_single_browser_task(
try:
logger.info(f"Starting browser task for query: {task_query}")
extra_args = [f"--window-size={window_w},{window_h}"]
if browser_user_data_dir:
extra_args.append(f"--user-data-dir={browser_user_data_dir}")
if use_own_browser:
browser_binary_path = os.getenv("CHROME_PATH", None) or browser_binary_path
browser_binary_path = os.getenv("BROWSER_PATH", None) or browser_binary_path
if browser_binary_path == "":
browser_binary_path = None
chrome_user_data = os.getenv("CHROME_USER_DATA", None)
if chrome_user_data:
extra_args += [f"--user-data-dir={chrome_user_data}"]
browser_user_data = browser_user_data_dir or os.getenv("BROWSER_USER_DATA", None)
if browser_user_data:
extra_args += [f"--user-data-dir={browser_user_data}"]
else:
browser_binary_path = None
bu_browser = CustomBrowser(
config=BrowserConfig(
headless=headless,
disable_security=False,
browser_binary_path=browser_binary_path,
extra_browser_args=extra_args,
wss_url=wss_url,
@@ -107,7 +104,8 @@ async def run_single_browser_task(
context_config = BrowserContextConfig(
save_downloads_path="./tmp/downloads",
browser_window_size=BrowserContextWindowSize(width=window_w, height=window_h),
window_height=window_h,
window_width=window_w,
force_new_context=True,
)
bu_browser_context = await bu_browser.new_context(config=context_config)
@@ -299,30 +297,34 @@ Provide a list of distinct search queries(up to {max_parallel_browsers}) that ar
# --- Langgraph State Definition ---
class ResearchPlanItem(TypedDict):
step: int
task: str
class ResearchTaskItem(TypedDict):
# step: int # Maybe step within category, or just implicit by order
task_description: str
status: str # "pending", "completed", "failed"
queries: Optional[List[str]] # Queries generated for this task
result_summary: Optional[str] # Optional brief summary after execution
queries: Optional[List[str]]
result_summary: Optional[str]
class ResearchCategoryItem(TypedDict):
category_name: str
tasks: List[ResearchTaskItem]
# Optional: category_status: str # Could be "pending", "in_progress", "completed"
class DeepResearchState(TypedDict):
task_id: str
topic: str
research_plan: List[ResearchPlanItem]
search_results: List[Dict[str, Any]] # Stores results from browser_search_tool_func
# messages: Sequence[BaseMessage] # History for ReAct-like steps within nodes
llm: Any # The LLM instance
research_plan: List[ResearchCategoryItem] # CHANGED
search_results: List[Dict[str, Any]]
llm: Any
tools: List[Tool]
output_dir: Path
browser_config: Dict[str, Any]
final_report: Optional[str]
current_step_index: int # To track progress through the plan
stop_requested: bool # Flag to signal termination
# Add other state variables as needed
error_message: Optional[str] # To store errors
current_category_index: int
current_task_index_in_category: int
stop_requested: bool
error_message: Optional[str]
messages: List[BaseMessage]
@@ -330,44 +332,75 @@ class DeepResearchState(TypedDict):
def _load_previous_state(task_id: str, output_dir: str) -> Dict[str, Any]:
"""Loads state from files if they exist."""
state_updates = {}
plan_file = os.path.join(output_dir, PLAN_FILENAME)
search_file = os.path.join(output_dir, SEARCH_INFO_FILENAME)
loaded_plan: List[ResearchCategoryItem] = []
next_cat_idx, next_task_idx = 0, 0
found_pending = False
if os.path.exists(plan_file):
try:
with open(plan_file, "r", encoding="utf-8") as f:
# Basic parsing, assumes markdown checklist format
plan = []
step = 1
for line in f:
line = line.strip()
if line.startswith(("- [x]", "- [ ]")):
status = "completed" if line.startswith("- [x]") else "pending"
task = line[5:].strip()
plan.append(
ResearchPlanItem(
step=step,
task=task,
status=status,
queries=None,
result_summary=None,
)
current_category: Optional[ResearchCategoryItem] = None
lines = f.readlines()
cat_counter = 0
task_counter_in_cat = 0
for line_num, line_content in enumerate(lines):
line = line_content.strip()
if line.startswith("## "): # Category
if current_category: # Save previous category
loaded_plan.append(current_category)
if not found_pending: # If previous category was all done, advance cat counter
cat_counter += 1
task_counter_in_cat = 0
category_name = line[line.find(" "):].strip() # Get text after "## X. "
current_category = ResearchCategoryItem(category_name=category_name, tasks=[])
elif (line.startswith("- [ ]") or line.startswith("- [x]") or line.startswith(
"- [-]")) and current_category: # Task
status = "pending"
if line.startswith("- [x]"):
status = "completed"
elif line.startswith("- [-]"):
status = "failed"
task_desc = line[5:].strip()
current_category["tasks"].append(
ResearchTaskItem(task_description=task_desc, status=status, queries=None,
result_summary=None)
)
step += 1
state_updates["research_plan"] = plan
# Determine next step index based on loaded plan
next_step = next(
(i for i, item in enumerate(plan) if item["status"] == "pending"),
len(plan),
)
state_updates["current_step_index"] = next_step
if status == "pending" and not found_pending:
next_cat_idx = cat_counter
next_task_idx = task_counter_in_cat
found_pending = True
if not found_pending: # only increment if previous tasks were completed/failed
task_counter_in_cat += 1
if current_category: # Append last category
loaded_plan.append(current_category)
if loaded_plan:
state_updates["research_plan"] = loaded_plan
if not found_pending and loaded_plan: # All tasks were completed or failed
next_cat_idx = len(loaded_plan) # Points beyond the last category
next_task_idx = 0
state_updates["current_category_index"] = next_cat_idx
state_updates["current_task_index_in_category"] = next_task_idx
logger.info(
f"Loaded research plan from {plan_file}, next step index: {next_step}"
f"Loaded hierarchical research plan from {plan_file}. "
f"Next task: Category {next_cat_idx}, Task {next_task_idx} in category."
)
else:
logger.warning(f"Plan file {plan_file} was empty or malformed.")
except Exception as e:
logger.error(f"Failed to load or parse research plan {plan_file}: {e}")
logger.error(f"Failed to load or parse research plan {plan_file}: {e}", exc_info=True)
state_updates["error_message"] = f"Failed to load research plan: {e}"
else:
logger.info(f"Plan file {plan_file} not found. Will start fresh.")
if os.path.exists(search_file):
try:
with open(search_file, "r", encoding="utf-8") as f:
@@ -375,22 +408,25 @@ def _load_previous_state(task_id: str, output_dir: str) -> Dict[str, Any]:
logger.info(f"Loaded search results from {search_file}")
except Exception as e:
logger.error(f"Failed to load search results {search_file}: {e}")
state_updates["error_message"] = f"Failed to load search results: {e}"
# Decide if this is fatal or if we can continue without old results
state_updates["error_message"] = (
state_updates.get("error_message", "") + f" Failed to load search results: {e}").strip()
return state_updates
def _save_plan_to_md(plan: List[ResearchPlanItem], output_dir: str):
"""Saves the research plan to a markdown checklist file."""
def _save_plan_to_md(plan: List[ResearchCategoryItem], output_dir: str):
plan_file = os.path.join(output_dir, PLAN_FILENAME)
try:
with open(plan_file, "w", encoding="utf-8") as f:
f.write("# Research Plan\n\n")
for item in plan:
marker = "- [x]" if item["status"] == "completed" else "- [ ]"
f.write(f"{marker} {item['task']}\n")
logger.info(f"Research plan saved to {plan_file}")
f.write(f"# Research Plan\n\n")
for cat_idx, category in enumerate(plan):
f.write(f"## {cat_idx + 1}. {category['category_name']}\n\n")
for task_idx, task in enumerate(category['tasks']):
marker = "- [x]" if task["status"] == "completed" else "- [ ]" if task[
"status"] == "pending" else "- [-]" # [-] for failed
f.write(f" {marker} {task['task_description']}\n")
f.write("\n")
logger.info(f"Hierarchical research plan saved to {plan_file}")
except Exception as e:
logger.error(f"Failed to save research plan to {plan_file}: {e}")
@@ -419,7 +455,6 @@ def _save_report_to_md(report: str, output_dir: Path):
async def planning_node(state: DeepResearchState) -> Dict[str, Any]:
"""Generates the initial research plan or refines it if resuming."""
logger.info("--- Entering Planning Node ---")
if state.get("stop_requested"):
logger.info("Stop requested, skipping planning.")
@@ -428,293 +463,344 @@ async def planning_node(state: DeepResearchState) -> Dict[str, Any]:
llm = state["llm"]
topic = state["topic"]
existing_plan = state.get("research_plan")
existing_results = state.get("search_results")
output_dir = state["output_dir"]
if existing_plan and state.get("current_step_index", 0) > 0:
if existing_plan and (
state.get("current_category_index", 0) > 0 or state.get("current_task_index_in_category", 0) > 0):
logger.info("Resuming with existing plan.")
# Maybe add logic here to let LLM review and potentially adjust the plan
# based on existing_results, but for now, we just use the loaded plan.
_save_plan_to_md(existing_plan, output_dir) # Ensure it's saved initially
return {"research_plan": existing_plan} # Return the loaded plan
# current_category_index and current_task_index_in_category should be set by _load_previous_state
return {"research_plan": existing_plan}
logger.info(f"Generating new research plan for topic: {topic}")
prompt = ChatPromptTemplate.from_messages(
[
(
"system",
"""You are a meticulous research assistant. Your goal is to create a step-by-step research plan to thoroughly investigate a given topic.
The plan should consist of clear, actionable research tasks or questions. Each step should logically build towards a comprehensive understanding.
Format the output as a numbered list. Each item should represent a distinct research step or question.
Example:
1. Define the core concepts and terminology related to [Topic].
2. Identify the key historical developments of [Topic].
3. Analyze the current state-of-the-art and recent advancements in [Topic].
4. Investigate the major challenges and limitations associated with [Topic].
5. Explore the future trends and potential applications of [Topic].
6. Summarize the findings and draw conclusions.
prompt_text = f"""You are a meticulous research assistant. Your goal is to create a hierarchical research plan to thoroughly investigate the topic: "{topic}".
The plan should be structured into several main research categories. Each category should contain a list of specific, actionable research tasks or questions.
Format the output as a JSON list of objects. Each object represents a research category and should have:
1. "category_name": A string for the name of the research category.
2. "tasks": A list of strings, where each string is a specific research task for that category.
Keep the plan focused and manageable. Aim for 5-10 detailed steps.
""",
),
("human", f"Generate a research plan for the topic: {topic}"),
]
)
Example JSON Output:
[
{{
"category_name": "Understanding Core Concepts and Definitions",
"tasks": [
"Define the primary terminology associated with '{topic}'.",
"Identify the fundamental principles and theories underpinning '{topic}'."
]
}},
{{
"category_name": "Historical Development and Key Milestones",
"tasks": [
"Trace the historical evolution of '{topic}'.",
"Identify key figures, events, or breakthroughs in the development of '{topic}'."
]
}},
{{
"category_name": "Current State-of-the-Art and Applications",
"tasks": [
"Analyze the current advancements and prominent applications of '{topic}'.",
"Investigate ongoing research and active areas of development related to '{topic}'."
]
}},
{{
"category_name": "Challenges, Limitations, and Future Outlook",
"tasks": [
"Identify the major challenges and limitations currently facing '{topic}'.",
"Explore potential future trends, ethical considerations, and societal impacts of '{topic}'."
]
}}
]
Generate a plan with 3-10 categories, and 2-6 tasks per category for the topic: "{topic}" according to the complexity of the topic.
Ensure the output is a valid JSON array.
"""
messages = [
SystemMessage(content="You are a research planning assistant outputting JSON."),
HumanMessage(content=prompt_text)
]
try:
response = await llm.ainvoke(prompt.format_prompt(topic=topic).to_messages())
plan_text = response.content
response = await llm.ainvoke(messages)
raw_content = response.content
# The LLM might wrap the JSON in backticks
if raw_content.strip().startswith("```json"):
raw_content = raw_content.strip()[7:-3].strip()
elif raw_content.strip().startswith("```"):
raw_content = raw_content.strip()[3:-3].strip()
# Parse the numbered list into the plan structure
new_plan: List[ResearchPlanItem] = []
for i, line in enumerate(plan_text.strip().split("\n")):
line = line.strip()
if line and (line[0].isdigit() or line.startswith(("*", "-"))):
# Simple parsing: remove number/bullet and space
task_text = (
line.split(".", 1)[-1].strip()
if line[0].isdigit()
else line[1:].strip()
)
if task_text:
new_plan.append(
ResearchPlanItem(
step=i + 1,
task=task_text,
logger.debug(f"LLM response for plan: {raw_content}")
parsed_plan_from_llm = json.loads(raw_content)
new_plan: List[ResearchCategoryItem] = []
for cat_idx, category_data in enumerate(parsed_plan_from_llm):
if not isinstance(category_data,
dict) or "category_name" not in category_data or "tasks" not in category_data:
logger.warning(f"Skipping invalid category data: {category_data}")
continue
tasks: List[ResearchTaskItem] = []
for task_idx, task_desc in enumerate(category_data["tasks"]):
if isinstance(task_desc, str):
tasks.append(
ResearchTaskItem(
task_description=task_desc,
status="pending",
queries=None,
result_summary=None,
)
)
else: # Sometimes LLM puts tasks as {"task": "description"}
if isinstance(task_desc, dict) and "task_description" in task_desc:
tasks.append(
ResearchTaskItem(
task_description=task_desc["task_description"],
status="pending",
queries=None,
result_summary=None,
)
)
elif isinstance(task_desc, dict) and "task" in task_desc: # common LLM mistake
tasks.append(
ResearchTaskItem(
task_description=task_desc["task"],
status="pending",
queries=None,
result_summary=None,
)
)
else:
logger.warning(
f"Skipping invalid task data: {task_desc} in category {category_data['category_name']}")
new_plan.append(
ResearchCategoryItem(
category_name=category_data["category_name"],
tasks=tasks,
)
)
if not new_plan:
logger.error("LLM failed to generate a valid plan structure.")
logger.error("LLM failed to generate a valid plan structure from JSON.")
return {"error_message": "Failed to generate research plan structure."}
logger.info(f"Generated research plan with {len(new_plan)} steps.")
_save_plan_to_md(new_plan, output_dir)
logger.info(f"Generated research plan with {len(new_plan)} categories.")
_save_plan_to_md(new_plan, output_dir) # Save the hierarchical plan
return {
"research_plan": new_plan,
"current_step_index": 0, # Start from the beginning
"search_results": [], # Initialize search results
"current_category_index": 0,
"current_task_index_in_category": 0,
"search_results": [],
}
except json.JSONDecodeError as e:
logger.error(f"Failed to parse JSON from LLM for plan: {e}. Response was: {raw_content}", exc_info=True)
return {"error_message": f"LLM generated invalid JSON for research plan: {e}"}
except Exception as e:
logger.error(f"Error during planning: {e}", exc_info=True)
return {"error_message": f"LLM Error during planning: {e}"}
async def research_execution_node(state: DeepResearchState) -> Dict[str, Any]:
    """
    Executes the current task of the research plan by invoking the LLM with tools.

    The LLM decides which tool (e.g. 'parallel_browser_search') to use and with
    which arguments. The node then runs the requested tool(s), records results,
    marks the task completed/failed, persists progress to disk, and advances the
    (category, task) indices.

    NOTE(review): this body resolves an unresolved merge — the original mixed the
    old step-index state keys with the new category/task-index keys. Only the
    category/task-index scheme is kept here.

    Returns a partial state update dict for the LangGraph state:
    - "research_plan", "search_results", "messages"
    - "current_category_index" / "current_task_index_in_category" (next task)
    - optionally "stop_requested" or "error_message"
    """
    logger.info("--- Entering Research Execution Node ---")
    if state.get("stop_requested"):
        logger.info("Stop requested, skipping research execution.")
        # Keep indices unchanged so a resume continues from the same task.
        return {
            "stop_requested": True,
            "current_category_index": state["current_category_index"],
            "current_task_index_in_category": state["current_task_index_in_category"],
        }

    plan = state["research_plan"]
    cat_idx = state["current_category_index"]
    task_idx = state["current_task_index_in_category"]
    llm = state["llm"]
    tools = state["tools"]
    output_dir = str(state["output_dir"])
    task_id = state["task_id"]  # Used to look up the stop event in _AGENT_STOP_FLAGS.

    # This check should ideally be handled by `should_continue` before routing here.
    if not plan or cat_idx >= len(plan):
        logger.info("Research plan complete or categories exhausted.")
        return {}  # Graph should route to synthesis.

    current_category = plan[cat_idx]

    # All tasks in this category done: advance to the next category.
    if task_idx >= len(current_category["tasks"]):
        logger.info(
            f"All tasks in category '{current_category['category_name']}' completed. Moving to next category.")
        return {
            "current_category_index": cat_idx + 1,
            "current_task_index_in_category": 0,
            "messages": state["messages"],  # Pass conversation history along unchanged.
        }

    current_task = current_category["tasks"][task_idx]

    # Task already done (e.g. on resume): skip it and advance the indices.
    if current_task["status"] == "completed":
        logger.info(
            f"Task '{current_task['task_description']}' in category "
            f"'{current_category['category_name']}' already completed. Skipping.")
        next_task_idx = task_idx + 1
        next_cat_idx = cat_idx
        if next_task_idx >= len(current_category["tasks"]):
            next_cat_idx += 1
            next_task_idx = 0
        return {
            "current_category_index": next_cat_idx,
            "current_task_index_in_category": next_task_idx,
            "messages": state["messages"],
        }

    logger.info(
        f"Executing research task: '{current_task['task_description']}' "
        f"(Category: '{current_category['category_name']}')"
    )

    # Bind tools so the LLM can emit tool calls for this invocation.
    llm_with_tools = llm.bind_tools(tools)

    task_prompt_content = (
        f"Current Research Category: {current_category['category_name']}\n"
        f"Specific Task: {current_task['task_description']}\n\n"
        "Please use the available tools, especially 'parallel_browser_search', to gather information for this specific task. "
        "Provide focused search queries relevant ONLY to this task. "
        "If you believe you have sufficient information from previous steps for this specific task, you can indicate that you are ready to summarize or that no further search is needed."
    )
    current_task_message_history = [HumanMessage(content=task_prompt_content)]
    if not state["messages"]:  # First execution: seed with a system instruction.
        invocation_messages = [
            SystemMessage(
                content="You are a research assistant executing one task of a research plan. Focus on the current task only."),
        ] + current_task_message_history
    else:
        invocation_messages = state["messages"] + current_task_message_history

    try:
        logger.info(f"Invoking LLM with tools for task: {current_task['task_description']}")
        ai_response: BaseMessage = await llm_with_tools.ainvoke(invocation_messages)
        logger.info("LLM invocation complete.")

        tool_results = []            # ToolMessage objects fed back into the conversation.
        executed_tool_names = []     # Names of tools the LLM requested.
        current_search_results = state.get("search_results", [])

        # LLM answered without calling any tool: keep the task pending and
        # nudge the LLM via the message history so the next attempt uses a tool.
        if not isinstance(ai_response, AIMessage) or not ai_response.tool_calls:
            logger.warning(
                f"LLM did not call any tool for task '{current_task['task_description']}'. "
                f"Response: {ai_response.content[:100]}..."
            )
            current_task["status"] = "pending"
            current_task["result_summary"] = f"LLM did not use a tool. Response: {ai_response.content}"
            _save_plan_to_md(plan, output_dir)
            return {
                "research_plan": plan,
                "current_category_index": cat_idx,
                "current_task_index_in_category": task_idx,
                "messages": state["messages"] + current_task_message_history + [
                    ai_response,
                    HumanMessage(
                        content="Please use a tool to do research unless you are thinking or summarizing."),
                ],
            }

        # Execute every tool call the LLM requested, in order.
        for tool_call in ai_response.tool_calls:
            tool_name = tool_call.get("name")
            tool_args = tool_call.get("args", {})
            tool_call_id = tool_call.get("id")  # Needed to pair ToolMessage with the call.

            logger.info(f"LLM requested tool call: {tool_name} with args: {tool_args}")
            executed_tool_names.append(tool_name)

            selected_tool = next((t for t in tools if t.name == tool_name), None)
            if not selected_tool:
                logger.error(f"LLM called tool '{tool_name}' which is not available.")
                tool_results.append(
                    ToolMessage(content=f"Error: Tool '{tool_name}' not found.", tool_call_id=tool_call_id))
                continue

            try:
                # Cooperative stop check before each tool run (tools may also check).
                stop_event = _AGENT_STOP_FLAGS.get(task_id)
                if stop_event and stop_event.is_set():
                    logger.info(f"Stop requested before executing tool: {tool_name}")
                    current_task["status"] = "pending"  # Not completed due to stop.
                    _save_plan_to_md(plan, output_dir)
                    return {"stop_requested": True, "research_plan": plan, "current_category_index": cat_idx,
                            "current_task_index_in_category": task_idx}

                logger.info(f"Executing tool: {tool_name}")
                tool_output = await selected_tool.ainvoke(tool_args)
                logger.info(f"Tool '{tool_name}' executed successfully.")

                if tool_name == "parallel_browser_search":
                    # Browser tool returns a List[Dict] of per-query results.
                    current_search_results.extend(tool_output)
                else:
                    # Other tools: log and store a generic stringified record.
                    logger.info(f"Result from tool '{tool_name}': {str(tool_output)[:200]}...")
                    current_search_results.append(
                        {"tool_name": tool_name, "args": tool_args, "output": str(tool_output),
                         "status": "completed"})
                tool_results.append(ToolMessage(content=json.dumps(tool_output), tool_call_id=tool_call_id))
            except Exception as e:
                logger.error(f"Error executing tool '{tool_name}': {e}", exc_info=True)
                tool_results.append(
                    ToolMessage(content=f"Error executing tool {tool_name}: {e}", tool_call_id=tool_call_id))
                current_search_results.append(
                    {"tool_name": tool_name, "args": tool_args, "status": "failed", "error": str(e)})

        # Derive the task status from the tool outcomes.
        step_failed_tool_execution = any("Error:" in str(tr.content) for tr in tool_results)
        if step_failed_tool_execution:
            current_task["status"] = "failed"
            current_task["result_summary"] = (
                f"Tool execution failed. Errors: "
                f"{[tr.content for tr in tool_results if 'Error' in str(tr.content)]}"
            )
        elif executed_tool_names:
            current_task["status"] = "completed"
            current_task["result_summary"] = f"Executed tool(s): {', '.join(executed_tool_names)}."
        else:
            # AIMessage had a tool_calls attribute but it was effectively empty.
            current_task["status"] = "failed"
            current_task["result_summary"] = "LLM prepared for tool call but provided no tools."

        # Persist progress so the run can be resumed.
        _save_plan_to_md(plan, output_dir)
        _save_search_results_to_json(current_search_results, output_dir)

        # Advance to the next task (wrapping into the next category if needed).
        next_task_idx = task_idx + 1
        next_cat_idx = cat_idx
        if next_task_idx >= len(current_category["tasks"]):
            next_cat_idx += 1
            next_task_idx = 0

        updated_messages = state["messages"] + current_task_message_history + [ai_response] + tool_results
        return {
            "research_plan": plan,
            "search_results": current_search_results,
            "current_category_index": next_cat_idx,
            "current_task_index_in_category": next_task_idx,
            "messages": updated_messages,
        }
    except Exception as e:
        logger.error(f"Unhandled error during research execution for task "
                     f"'{current_task['task_description']}': {e}",
                     exc_info=True)
        current_task["status"] = "failed"
        _save_plan_to_md(plan, output_dir)
        # Advance past the failed task so one bad task cannot wedge the run.
        next_task_idx = task_idx + 1
        next_cat_idx = cat_idx
        if next_task_idx >= len(current_category["tasks"]):
            next_cat_idx += 1
            next_task_idx = 0
        return {
            "research_plan": plan,
            "current_category_index": next_cat_idx,
            "current_task_index_in_category": next_task_idx,
            "error_message": f"Core Execution Error on task '{current_task['task_description']}': {e}",
            "messages": state["messages"] + current_task_message_history,  # Preserve history up to the error.
        }
@@ -747,36 +833,37 @@ async def synthesis_node(state: DeepResearchState) -> Dict[str, Any]:
references = {}
ref_count = 1
for i, result_entry in enumerate(search_results):
query = result_entry.get("query", "Unknown Query")
query = result_entry.get("query", "Unknown Query") # From parallel_browser_search
tool_name = result_entry.get("tool_name") # From other tools
status = result_entry.get("status", "unknown")
result_data = result_entry.get(
"result"
) # This should be the dict with summary, title, url
error = result_entry.get("error")
result_data = result_entry.get("result") # From BrowserUseAgent's final_result
tool_output_str = result_entry.get("output") # From other tools
if status == "completed" and result_data:
summary = result_data
formatted_results += f'### Finding from Query: "{query}"\n'
formatted_results += f"- **Summary:**\n{summary}\n"
if tool_name == "parallel_browser_search" and status == "completed" and result_data:
# result_data is the summary from BrowserUseAgent
formatted_results += f'### Finding from Web Search Query: "{query}"\n'
formatted_results += f"- **Summary:**\n{result_data}\n" # result_data is already a summary string here
# If result_data contained title/URL, you'd format them here.
# The current BrowserUseAgent returns a string summary directly as 'final_data' in run_single_browser_task
formatted_results += "---\n"
elif tool_name != "parallel_browser_search" and status == "completed" and tool_output_str:
formatted_results += f'### Finding from Tool: "{tool_name}" (Args: {result_entry.get("args")})\n'
formatted_results += f"- **Output:**\n{tool_output_str}\n"
formatted_results += "---\n"
elif status == "failed":
formatted_results += f'### Failed Query: "{query}"\n'
error = result_entry.get("error")
q_or_t = f"Query: \"{query}\"" if query != "Unknown Query" else f"Tool: \"{tool_name}\""
formatted_results += f'### Failed {q_or_t}\n'
formatted_results += f"- **Error:** {error}\n"
formatted_results += "---\n"
# Ignore cancelled/other statuses for the report content
# Prepare the research plan context
plan_summary = "\nResearch Plan Followed:\n"
for item in plan:
marker = (
"- [x]"
if item["status"] == "completed"
else "- [ ] (Failed)"
if item["status"] == "failed"
else "- [ ]"
)
plan_summary += f"{marker} {item['task']}\n"
for cat_idx, category in enumerate(plan):
plan_summary += f"\n#### Category {cat_idx + 1}: {category['category_name']}\n"
for task_idx, task in enumerate(category['tasks']):
marker = "[x]" if task["status"] == "completed" else "[ ]" if task["status"] == "pending" else "[-]"
plan_summary += f" - {marker} {task['task_description']}\n"
synthesis_prompt = ChatPromptTemplate.from_messages(
[
@@ -785,29 +872,28 @@ async def synthesis_node(state: DeepResearchState) -> Dict[str, Any]:
"""You are a professional researcher tasked with writing a comprehensive and well-structured report based on collected findings.
The report should address the research topic thoroughly, synthesizing the information gathered from various sources.
Structure the report logically:
1. **Introduction:** Briefly introduce the topic and the report's scope (mentioning the research plan followed is good).
2. **Main Body:** Discuss the key findings, organizing them thematically or according to the research plan steps. Analyze, compare, and contrast information from different sources where applicable. **Crucially, cite your sources using bracketed numbers [X] corresponding to the reference list.**
3. **Conclusion:** Summarize the main points and offer concluding thoughts or potential areas for further research.
1. Briefly introduce the topic and the report's scope (mentioning the research plan followed, including categories and tasks, is good).
2. Discuss the key findings, organizing them thematically, possibly aligning with the research categories. Analyze, compare, and contrast information.
3. Summarize the main points and offer concluding thoughts.
Ensure the tone is objective, professional, and analytical. Base the report **strictly** on the provided findings. Do not add external knowledge. If findings are contradictory or incomplete, acknowledge this.
""",
Ensure the tone is objective and professional.
If findings are contradictory or incomplete, acknowledge this.
""", # Removed citation part for simplicity for now, as browser agent returns summaries.
),
(
"human",
f"""
**Research Topic:** {topic}
**Research Topic:** {topic}
{plan_summary}
{plan_summary}
**Collected Findings:**
```
{formatted_results}
```
**Collected Findings:**
```
{formatted_results}
```
```
Please generate the final research report in Markdown format based **only** on the information above. Ensure all claims derived from the findings are properly cited using the format [Reference_ID].
""",
Please generate the final research report in Markdown format based **only** on the information above.
""",
),
]
)
@@ -818,7 +904,6 @@ async def synthesis_node(state: DeepResearchState) -> Dict[str, Any]:
topic=topic,
plan_summary=plan_summary,
formatted_results=formatted_results,
references=references,
).to_messages()
)
final_report_md = response.content
@@ -847,34 +932,44 @@ async def synthesis_node(state: DeepResearchState) -> Dict[str, Any]:
def should_continue(state: DeepResearchState) -> str:
    """
    Routing condition for the research graph.

    Returns "execute_research" while there are (potentially) pending tasks,
    "synthesize_report" once every category/task has been processed, and
    "end_run" on a stop request, a critical error, or a missing plan.

    NOTE(review): this body resolves an unresolved merge — the original kept the
    old `current_step_index` routing above the new category/task routing, so the
    old branch would always return first. Only the category/task routing is kept.
    """
    logger.info("--- Evaluating Condition: Should Continue? ---")
    if state.get("stop_requested"):
        logger.info("Stop requested, routing to END.")
        return "end_run"
    # Only a critical error raised inside the execution node halts the run;
    # per-task failures are recorded in the plan and the run continues.
    if state.get("error_message") and "Core Execution Error" in state["error_message"]:
        logger.warning(f"Critical error detected: {state['error_message']}. Routing to END.")
        return "end_run"

    plan = state.get("research_plan")
    cat_idx = state.get("current_category_index", 0)
    task_idx = state.get("current_task_index_in_category", 0)  # Next task to check.

    if not plan:
        logger.warning("No research plan found. Routing to END.")
        return "end_run"

    if cat_idx < len(plan):
        current_category = plan[cat_idx]
        if task_idx < len(current_category["tasks"]):
            # plan[cat_idx]["tasks"][task_idx] is next up; the execution node
            # handles the case where it is already completed.
            logger.info(
                f"Plan has potential pending tasks (next up: Category {cat_idx}, Task {task_idx}). "
                f"Routing to Research Execution."
            )
            return "execute_research"
        # task_idx ran past this category: continue if another category exists.
        # The execution node resets the indices to the start of the next category.
        if cat_idx + 1 < len(plan):
            logger.info(
                f"Finished tasks in category {cat_idx}. Moving to category {cat_idx + 1}. "
                f"Routing to Research Execution."
            )
            return "execute_research"

    # All categories and tasks processed (or indices out of bounds).
    logger.info("All plan categories and tasks processed or current indices are out of bounds. Routing to Synthesis.")
    return "synthesize_report"
# --- DeepSearchAgent Class ---
@@ -1033,22 +1128,24 @@ class DeepResearchAgent:
"messages": [],
"llm": self.llm,
"tools": agent_tools,
"output_dir": output_dir,
"output_dir": Path(output_dir),
"browser_config": self.browser_config,
"final_report": None,
"current_step_index": 0,
"current_category_index": 0,
"current_task_index_in_category": 0,
"stop_requested": False,
"error_message": None,
}
loaded_state = {}
if task_id:
logger.info(f"Attempting to resume task {task_id}...")
loaded_state = _load_previous_state(task_id, output_dir)
initial_state.update(loaded_state)
if loaded_state.get("research_plan"):
logger.info(
f"Resuming with {len(loaded_state['research_plan'])} plan steps and {len(loaded_state.get('search_results', []))} existing results."
f"Resuming with {len(loaded_state['research_plan'])} plan categories "
f"and {len(loaded_state.get('search_results', []))} existing results. "
f"Next task: Cat {initial_state['current_category_index']}, Task {initial_state['current_task_index_in_category']}"
)
initial_state["topic"] = (
topic # Allow overriding topic even when resuming? Or use stored topic? Let's use new one.
@@ -1057,7 +1154,6 @@ class DeepResearchAgent:
logger.warning(
f"Resume requested for {task_id}, but no previous plan found. Starting fresh."
)
initial_state["current_step_index"] = 0
# --- Execute Graph using ainvoke ---
final_state = None

View File

@@ -1,3 +1,5 @@
import os
import gradio as gr
import logging
from gradio.components import Component
@@ -56,7 +58,7 @@ def create_browser_settings_tab(webui_manager: WebuiManager):
)
keep_browser_open = gr.Checkbox(
label="Keep Browser Open",
value=True,
value=os.getenv("KEEP_BROWSER_OPEN", True),
info="Keep Browser Open between Tasks",
interactive=True
)
@@ -91,6 +93,7 @@ def create_browser_settings_tab(webui_manager: WebuiManager):
with gr.Row():
cdp_url = gr.Textbox(
label="CDP URL",
value=os.getenv("BROWSER_CDP", None),
info="CDP URL for browser remote debugging",
interactive=True,
)

View File

@@ -13,7 +13,7 @@ from browser_use.agent.views import (
AgentOutput,
)
from browser_use.browser.browser import BrowserConfig
from browser_use.browser.context import BrowserContext, BrowserContextWindowSize, BrowserContextConfig
from browser_use.browser.context import BrowserContext, BrowserContextConfig
from browser_use.browser.views import BrowserState
from gradio.components import Component
from langchain_core.language_models.chat_models import BaseChatModel
@@ -451,20 +451,16 @@ async def run_agent_task(
if not webui_manager.bu_browser:
logger.info("Launching new browser instance.")
extra_args = [f"--window-size={window_w},{window_h}"]
if browser_user_data_dir:
extra_args.append(f"--user-data-dir={browser_user_data_dir}")
if use_own_browser:
browser_binary_path = (
os.getenv("CHROME_PATH", None) or browser_binary_path
)
browser_binary_path = os.getenv("BROWSER_PATH", None) or browser_binary_path
if browser_binary_path == "":
browser_binary_path = None
chrome_user_data = os.getenv("CHROME_USER_DATA", None)
if chrome_user_data:
extra_args += [f"--user-data-dir={chrome_user_data}"]
browser_user_data = browser_user_data_dir or os.getenv("BROWSER_USER_DATA", None)
if browser_user_data:
extra_args += [f"--user-data-dir={browser_user_data}"]
else:
browser_binary_path = None
webui_manager.bu_browser = CustomBrowser(
config=BrowserConfig(
headless=headless,
@@ -485,7 +481,8 @@ async def run_agent_task(
if save_recording_path
else None,
save_downloads_path=save_download_path if save_download_path else None,
browser_window_size=BrowserContextWindowSize(width=window_w, height=window_h),
window_height=window_h,
window_width=window_w,
)
if not webui_manager.bu_browser:
raise ValueError("Browser not initialized, cannot create context.")

View File

@@ -66,8 +66,8 @@ startsecs=3
depends_on=x11vnc
[program:persistent_browser]
environment=START_URL="data:text/html,<html><body><h1>Browser Ready</h1></body></html>"
command=bash -c "mkdir -p /app/data/chrome_data && sleep 8 && $(find /ms-playwright/chromium-*/chrome-linux -name chrome) --user-data-dir=/app/data/chrome_data --window-position=0,0 --window-size=%(ENV_RESOLUTION_WIDTH)s,%(ENV_RESOLUTION_HEIGHT)s --start-maximized --no-sandbox --disable-dev-shm-usage --disable-gpu --disable-software-rasterizer --disable-setuid-sandbox --no-first-run --no-default-browser-check --no-experiments --ignore-certificate-errors --remote-debugging-port=9222 --remote-debugging-address=0.0.0.0 \"$START_URL\""
environment=START_URL="data:text/html,<html><body><h1>Browser Ready</h1></body></html>",BROWSER_USER_DATA="/app/data/chrome_data",BROWSER_DEBUGGING_PORT="%(ENV_BROWSER_DEBUGGING_PORT)s",BROWSER_DEBUGGING_HOST="%(ENV_BROWSER_DEBUGGING_HOST)s"
command=bash -c "mkdir -p %(ENV_BROWSER_USER_DATA)s && sleep 8 && $(find $PLAYWRIGHT_BROWSERS_PATH/chrome-*/chrome-linux -name chrome || find /root/.cache/ms-playwright/chrome-*/chrome-linux -name chrome || find /opt/google/chrome -name chrome || echo \"/usr/bin/google-chrome-stable\") --user-data-dir=%(ENV_BROWSER_USER_DATA)s --window-position=0,0 --window-size=%(ENV_RESOLUTION_WIDTH)s,%(ENV_RESOLUTION_HEIGHT)s --start-maximized --no-sandbox --disable-dev-shm-usage --disable-gpu --disable-software-rasterizer --disable-setuid-sandbox --no-first-run --no-default-browser-check --no-experiments --ignore-certificate-errors --remote-debugging-port=%(ENV_BROWSER_DEBUGGING_PORT)s --remote-debugging-address=%(ENV_BROWSER_DEBUGGING_HOST)s --enable-features=NetworkService,NetworkServiceInProcess --disable-features=ImprovedCookieControls \"$START_URL\""
autorestart=true
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
@@ -93,4 +93,4 @@ startretries=3
startsecs=3
stopsignal=TERM
stopwaitsecs=10
depends_on=persistent_browser
depends_on=persistent_browser

View File

@@ -20,8 +20,7 @@ from src.utils import utils
async def test_browser_use_agent():
from browser_use.browser.browser import Browser, BrowserConfig
from browser_use.browser.context import (
BrowserContextConfig,
BrowserContextWindowSize,
BrowserContextConfig
)
from browser_use.agent.service import Agent
@@ -38,12 +37,12 @@ async def test_browser_use_agent():
# api_key=os.getenv("OPENAI_API_KEY", ""),
# )
# llm = utils.get_llm_model(
# provider="google",
# model_name="gemini-2.0-flash",
# temperature=0.6,
# api_key=os.getenv("GOOGLE_API_KEY", "")
# )
llm = llm_provider.get_llm_model(
provider="google",
model_name="gemini-2.0-flash",
temperature=0.6,
api_key=os.getenv("GOOGLE_API_KEY", "")
)
# llm = utils.get_llm_model(
# provider="deepseek",
@@ -67,13 +66,13 @@ async def test_browser_use_agent():
window_w, window_h = 1280, 1100
llm = llm_provider.get_llm_model(
provider="azure_openai",
model_name="gpt-4o",
temperature=0.5,
base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
)
# llm = llm_provider.get_llm_model(
# provider="azure_openai",
# model_name="gpt-4o",
# temperature=0.5,
# base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
# api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
# )
mcp_server_config = {
"mcpServers": {
@@ -98,7 +97,6 @@ async def test_browser_use_agent():
controller = CustomController()
await controller.setup_mcp_client(mcp_server_config)
use_own_browser = True
disable_security = False
use_vision = True # Set to False when using DeepSeek
max_actions_per_step = 10
@@ -106,33 +104,30 @@ async def test_browser_use_agent():
browser_context = None
try:
extra_chromium_args = [f"--window-size={window_w},{window_h}"]
extra_browser_args = [f"--window-size={window_w},{window_h}"]
if use_own_browser:
chrome_path = os.getenv("CHROME_PATH", None)
if chrome_path == "":
chrome_path = None
chrome_user_data = os.getenv("CHROME_USER_DATA", None)
if chrome_user_data:
extra_chromium_args += [f"--user-data-dir={chrome_user_data}"]
browser_binary_path = os.getenv("BROWSER_PATH", None)
if browser_binary_path == "":
browser_binary_path = None
browser_user_data = os.getenv("BROWSER_USER_DATA", None)
if browser_user_data:
extra_browser_args += [f"--user-data-dir={browser_user_data}"]
else:
chrome_path = None
browser_binary_path = None
browser = CustomBrowser(
config=BrowserConfig(
headless=False,
disable_security=disable_security,
browser_binary_path=chrome_path,
extra_browser_args=extra_chromium_args,
browser_binary_path=browser_binary_path,
extra_browser_args=extra_browser_args,
)
)
browser_context = await browser.new_context(
config=BrowserContextConfig(
trace_path="./tmp/traces",
save_recording_path="./tmp/record_videos",
trace_path=None,
save_recording_path=None,
save_downloads_path="./tmp/downloads",
browser_window_size=BrowserContextWindowSize(
width=window_w, height=window_h
),
force_new_context=True
window_height=window_h,
window_width=window_w,
)
)
agent = BrowserUseAgent(
@@ -167,17 +162,9 @@ async def test_browser_use_agent():
async def test_browser_use_parallel():
from browser_use.browser.context import BrowserContextWindowSize
from browser_use.browser.browser import BrowserConfig
from patchright.async_api import async_playwright
from browser_use.browser.browser import Browser
from src.browser.custom_context import BrowserContextConfig
from src.controller.custom_controller import CustomController
from browser_use.browser.browser import Browser, BrowserConfig
from browser_use.browser.context import (
BrowserContextConfig,
BrowserContextWindowSize,
)
from browser_use.agent.service import Agent
@@ -261,8 +248,7 @@ async def test_browser_use_parallel():
}
controller = CustomController()
await controller.setup_mcp_client(mcp_server_config)
use_own_browser = False
disable_security = False
use_own_browser = True
use_vision = True # Set to False when using DeepSeek
max_actions_per_step = 10
@@ -270,32 +256,30 @@ async def test_browser_use_parallel():
browser_context = None
try:
extra_chromium_args = [f"--window-size={window_w},{window_h}"]
extra_browser_args = [f"--window-size={window_w},{window_h}"]
if use_own_browser:
chrome_path = os.getenv("CHROME_PATH", None)
if chrome_path == "":
chrome_path = None
chrome_user_data = os.getenv("CHROME_USER_DATA", None)
if chrome_user_data:
extra_chromium_args += [f"--user-data-dir={chrome_user_data}"]
browser_binary_path = os.getenv("BROWSER_PATH", None)
if browser_binary_path == "":
browser_binary_path = None
browser_user_data = os.getenv("BROWSER_USER_DATA", None)
if browser_user_data:
extra_browser_args += [f"--user-data-dir={browser_user_data}"]
else:
chrome_path = None
browser_binary_path = None
browser = CustomBrowser(
config=BrowserConfig(
headless=False,
disable_security=disable_security,
browser_binary_path=chrome_path,
extra_browser_args=extra_chromium_args,
browser_binary_path=browser_binary_path,
extra_browser_args=extra_browser_args,
)
)
browser_context = await browser.new_context(
config=BrowserContextConfig(
trace_path="./tmp/traces",
save_recording_path="./tmp/record_videos",
trace_path=None,
save_recording_path=None,
save_downloads_path="./tmp/downloads",
browser_window_size=BrowserContextWindowSize(
width=window_w, height=window_h
),
window_height=window_h,
window_width=window_w,
force_new_context=True
)
)
@@ -364,7 +348,7 @@ async def test_deep_research_agent():
browser_config = {"headless": False, "window_width": 1280, "window_height": 1100, "use_own_browser": False}
agent = DeepResearchAgent(llm=llm, browser_config=browser_config, mcp_server_config=mcp_server_config)
research_topic = "Give me a detailed travel plan to Switzerland from June 1st to 10th."
research_topic = "Give me investment advices of nvidia and tesla."
task_id_to_resume = "" # Set this to resume a previous task ID
print(f"Starting research on: {research_topic}")
@@ -405,6 +389,6 @@ async def test_deep_research_agent():
if __name__ == "__main__":
# asyncio.run(test_browser_use_agent())
asyncio.run(test_browser_use_agent())
# asyncio.run(test_browser_use_parallel())
asyncio.run(test_deep_research_agent())
# asyncio.run(test_deep_research_agent())