diff --git a/.env.example b/.env.example index ad0bc6a..2e007b2 100644 --- a/.env.example +++ b/.env.example @@ -40,14 +40,14 @@ ANONYMIZED_TELEMETRY=false # LogLevel: Set to debug to enable verbose logging, set to result to get results only. Available: result | debug | info BROWSER_USE_LOGGING_LEVEL=info -# Chrome settings -CHROME_PATH= -CHROME_USER_DATA= -CHROME_DEBUGGING_PORT=9222 -CHROME_DEBUGGING_HOST=localhost +# Browser settings +BROWSER_PATH= +BROWSER_USER_DATA= +BROWSER_DEBUGGING_PORT=9222 +BROWSER_DEBUGGING_HOST=localhost # Set to true to keep browser open between AI tasks -CHROME_PERSISTENT_SESSION=false -CHROME_CDP= +KEEP_BROWSER_OPEN=true +BROWSER_CDP= # Display settings # Format: WIDTHxHEIGHTxDEPTH RESOLUTION=1920x1080x24 diff --git a/Dockerfile b/Dockerfile index 7b6d39f..b4d6fa1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,8 @@ FROM python:3.11-slim +# Set platform for multi-arch builds (Docker Buildx will set this) +ARG TARGETPLATFORM + # Install system dependencies RUN apt-get update && apt-get install -y \ wget \ @@ -28,7 +31,6 @@ RUN apt-get update && apt-get install -y \ fonts-liberation \ dbus \ xauth \ - xvfb \ x11vnc \ tigervnc-tools \ supervisor \ @@ -47,33 +49,45 @@ RUN git clone https://github.com/novnc/noVNC.git /opt/novnc \ && git clone https://github.com/novnc/websockify /opt/novnc/utils/websockify \ && ln -s /opt/novnc/vnc.html /opt/novnc/index.html -# Set platform for ARM64 compatibility -ARG TARGETPLATFORM=linux/amd64 - # Set up working directory WORKDIR /app # Copy requirements and install Python dependencies COPY requirements.txt . -RUN pip install --no-cache-dir -r requirements.txt +# Ensure 'patchright' is in your requirements.txt or install it directly +# RUN pip install --no-cache-dir -r requirements.txt patchright # If not in requirements +RUN pip install --no-cache-dir -r requirements.txt # Assuming patchright is in requirements.txt +RUN pip install --no-cache-dir patchright # Or install it explicitly -# Install Playwright and browsers with system dependencies -ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright -RUN playwright install --with-deps chromium -RUN playwright install-deps +# Install Patchright browsers and dependencies +# Patchright documentation suggests PLAYWRIGHT_BROWSERS_PATH is still relevant +# or that Patchright installs to a similar default location that Playwright would. +# Let's assume Patchright respects PLAYWRIGHT_BROWSERS_PATH or its default install location is findable. +ENV PLAYWRIGHT_BROWSERS_PATH=/ms-browsers +RUN mkdir -p $PLAYWRIGHT_BROWSERS_PATH + +# Install recommended: Google Chrome (instead of just Chromium for better undetectability) +# The 'patchright install chrome' command might download and place it. +# The '--with-deps' equivalent for patchright install is to run 'patchright install-deps chrome' after. +RUN patchright install chrome +RUN patchright install-deps chrome + +# Alternative: Install Chromium if Google Chrome is problematic in certain environments +RUN patchright install chromium +RUN patchright install-deps chromium # Copy the application code COPY . . -# Set environment variables +# Set environment variables (Updated Names) ENV PYTHONUNBUFFERED=1 ENV BROWSER_USE_LOGGING_LEVEL=info -ENV CHROME_PATH=/ms-playwright/chromium-*/chrome-linux/chrome +# BROWSER_PATH will be determined by Patchright installation, supervisord will find it. ENV ANONYMIZED_TELEMETRY=false ENV DISPLAY=:99 ENV RESOLUTION=1920x1080x24 -ENV VNC_PASSWORD=vncpassword -ENV CHROME_PERSISTENT_SESSION=true +ENV VNC_PASSWORD=youvncpassword +ENV KEEP_BROWSER_OPEN=true ENV RESOLUTION_WIDTH=1920 ENV RESOLUTION_HEIGHT=1080 @@ -81,6 +95,6 @@ ENV RESOLUTION_HEIGHT=1080 RUN mkdir -p /var/log/supervisor COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf -EXPOSE 7788 6080 5901 +EXPOSE 7788 6080 5901 9222 -CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"] +CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"] \ No newline at end of file diff --git a/Dockerfile.arm64 b/Dockerfile.arm64 deleted file mode 100644 index 6d7a3ff..0000000 --- a/Dockerfile.arm64 +++ /dev/null @@ -1,85 +0,0 @@ -FROM python:3.11-slim - -# Install system dependencies -RUN apt-get update && apt-get install -y \ - wget \ - gnupg \ - curl \ - unzip \ - xvfb \ - libgconf-2-4 \ - libxss1 \ - libnss3 \ - libnspr4 \ - libasound2 \ - libatk1.0-0 \ - libatk-bridge2.0-0 \ - libcups2 \ - libdbus-1-3 \ - libdrm2 \ - libgbm1 \ - libgtk-3-0 \ - libxcomposite1 \ - libxdamage1 \ - libxfixes3 \ - libxrandr2 \ - xdg-utils \ - fonts-liberation \ - dbus \ - xauth \ - xvfb \ - x11vnc \ - tigervnc-tools \ - supervisor \ - net-tools \ - procps \ - git \ - python3-numpy \ - fontconfig \ - fonts-dejavu \ - fonts-dejavu-core \ - fonts-dejavu-extra \ - && rm -rf /var/lib/apt/lists/* - -# Install noVNC -RUN git clone https://github.com/novnc/noVNC.git /opt/novnc \ - && git clone https://github.com/novnc/websockify /opt/novnc/utils/websockify \ - && ln -s /opt/novnc/vnc.html /opt/novnc/index.html - -# Set platform explicitly for ARM64 -ARG TARGETPLATFORM=linux/arm64 - -# Set up working directory -WORKDIR /app - -# Copy requirements and install Python dependencies -COPY requirements.txt . -RUN pip install --no-cache-dir -r requirements.txt - -# Install Playwright and browsers with system dependencies optimized for ARM64 -ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright -RUN PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1 pip install playwright && \ - playwright install --with-deps chromium - -# Copy the application code -COPY . . - -# Set environment variables -ENV PYTHONUNBUFFERED=1 -ENV BROWSER_USE_LOGGING_LEVEL=info -ENV CHROME_PATH=/ms-playwright/chromium-*/chrome-linux/chrome -ENV ANONYMIZED_TELEMETRY=false -ENV DISPLAY=:99 -ENV RESOLUTION=1920x1080x24 -ENV VNC_PASSWORD=vncpassword -ENV CHROME_PERSISTENT_SESSION=true -ENV RESOLUTION_WIDTH=1920 -ENV RESOLUTION_HEIGHT=1080 - -# Set up supervisor configuration -RUN mkdir -p /var/log/supervisor -COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf - -EXPOSE 7788 6080 5901 - -CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"] \ No newline at end of file diff --git a/README.md b/README.md index 91fb7fa..238dd40 100644 --- a/README.md +++ b/README.md @@ -23,10 +23,6 @@ We would like to officially thank [WarmShao](https://github.com/warmshao) for hi ## Installation Guide -### Prerequisites -- Python 3.11 or higher -- Git (for cloning the repository) - ### Option 1: Local Installation Read the [quickstart guide](https://docs.browser-use.com/quickstart#prepare-the-environment) or follow the steps below to get started. @@ -65,10 +61,13 @@ Install Python packages: uv pip install -r requirements.txt ``` -Install Browsers in Playwright: -You can install specific browsers by running: +Install Browsers in Patchright. ```bash -patchright install chromium +patchright install +``` +Or you can install specific browsers by running: +```bash +patchright install chromium --with-deps --no-shell ``` #### Step 4: Configure Environment @@ -83,6 +82,42 @@ cp .env.example .env ``` 2. Open `.env` in your preferred text editor and add your API keys and other settings +#### Local Setup +1. **Run the WebUI:** + After completing the installation steps above, start the application: + ```bash + python webui.py --ip 127.0.0.1 --port 7788 + ``` +2. WebUI options: + - `--ip`: The IP address to bind the WebUI to. Default is `127.0.0.1`. + - `--port`: The port to bind the WebUI to. Default is `7788`. + - `--theme`: The theme for the user interface. Default is `Ocean`. + - **Default**: The standard theme with a balanced design. + - **Soft**: A gentle, muted color scheme for a relaxed viewing experience. + - **Monochrome**: A grayscale theme with minimal color for simplicity and focus. + - **Glass**: A sleek, semi-transparent design for a modern appearance. + - **Origin**: A classic, retro-inspired theme for a nostalgic feel. + - **Citrus**: A vibrant, citrus-inspired palette with bright and fresh colors. + - **Ocean** (default): A blue, ocean-inspired theme providing a calming effect. + - `--dark-mode`: Enables dark mode for the user interface. +3. **Access the WebUI:** Open your web browser and navigate to `http://127.0.0.1:7788`. +4. **Using Your Own Browser(Optional):** + - Set `CHROME_PATH` to the executable path of your browser and `CHROME_USER_DATA` to the user data directory of your browser. Leave `CHROME_USER_DATA` empty if you want to use local user data. + - Windows + ```env + CHROME_PATH="C:\Program Files\Google\Chrome\Application\chrome.exe" + CHROME_USER_DATA="C:\Users\YourUsername\AppData\Local\Google\Chrome\User Data" + ``` + > Note: Replace `YourUsername` with your actual Windows username for Windows systems. + - Mac + ```env + CHROME_PATH="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" + CHROME_USER_DATA="/Users/YourUsername/Library/Application Support/Google/Chrome" + ``` + - Close all Chrome windows + - Open the WebUI in a non-Chrome browser, such as Firefox or Edge. This is important because the persistent browser context will use the Chrome data when running the agent. + - Check the "Use Own Browser" option within the Browser Settings. + ### Option 2: Docker Installation #### Prerequisites @@ -118,95 +153,12 @@ docker compose up --build CHROME_PERSISTENT_SESSION=true docker compose up --build ``` - 4. Access the Application: - Web Interface: Open `http://localhost:7788` in your browser - VNC Viewer (for watching browser interactions): Open `http://localhost:6080/vnc.html` - Default VNC password: "youvncpassword" - Can be changed by setting `VNC_PASSWORD` in your `.env` file -## Usage - -### Local Setup -1. **Run the WebUI:** - After completing the installation steps above, start the application: - ```bash - python webui.py --ip 127.0.0.1 --port 7788 - ``` -2. WebUI options: - - `--ip`: The IP address to bind the WebUI to. Default is `127.0.0.1`. - - `--port`: The port to bind the WebUI to. Default is `7788`. - - `--theme`: The theme for the user interface. Default is `Ocean`. - - **Default**: The standard theme with a balanced design. - - **Soft**: A gentle, muted color scheme for a relaxed viewing experience. - - **Monochrome**: A grayscale theme with minimal color for simplicity and focus. - - **Glass**: A sleek, semi-transparent design for a modern appearance. - - **Origin**: A classic, retro-inspired theme for a nostalgic feel. - - **Citrus**: A vibrant, citrus-inspired palette with bright and fresh colors. - - **Ocean** (default): A blue, ocean-inspired theme providing a calming effect. - - `--dark-mode`: Enables dark mode for the user interface. -3. **Access the WebUI:** Open your web browser and navigate to `http://127.0.0.1:7788`. -4. **Using Your Own Browser(Optional):** - - Set `CHROME_PATH` to the executable path of your browser and `CHROME_USER_DATA` to the user data directory of your browser. Leave `CHROME_USER_DATA` empty if you want to use local user data. - - Windows - ```env - CHROME_PATH="C:\Program Files\Google\Chrome\Application\chrome.exe" - CHROME_USER_DATA="C:\Users\YourUsername\AppData\Local\Google\Chrome\User Data" - ``` - > Note: Replace `YourUsername` with your actual Windows username for Windows systems. - - Mac - ```env - CHROME_PATH="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" - CHROME_USER_DATA="/Users/YourUsername/Library/Application Support/Google/Chrome" - ``` - - Close all Chrome windows - - Open the WebUI in a non-Chrome browser, such as Firefox or Edge. This is important because the persistent browser context will use the Chrome data when running the agent. - - Check the "Use Own Browser" option within the Browser Settings. -5. **Keep Browser Open(Optional):** - - Set `CHROME_PERSISTENT_SESSION=true` in the `.env` file. - -### Docker Setup -1. **Environment Variables:** - - All configuration is done through the `.env` file - - Available environment variables: - ``` - # LLM API Keys - OPENAI_API_KEY=your_key_here - ANTHROPIC_API_KEY=your_key_here - GOOGLE_API_KEY=your_key_here - - # Browser Settings - CHROME_PERSISTENT_SESSION=true # Set to true to keep browser open between AI tasks - RESOLUTION=1920x1080x24 # Custom resolution format: WIDTHxHEIGHTxDEPTH - RESOLUTION_WIDTH=1920 # Custom width in pixels - RESOLUTION_HEIGHT=1080 # Custom height in pixels - - # VNC Settings - VNC_PASSWORD=your_vnc_password # Optional, defaults to "vncpassword" - ``` - -2. **Platform Support:** - - Supports both AMD64 and ARM64 architectures - - For ARM64 systems (e.g., Apple Silicon Macs), the container will automatically use the appropriate image - -3. **Browser Persistence Modes:** - - **Default Mode (CHROME_PERSISTENT_SESSION=false):** - - Browser opens and closes with each AI task - - Clean state for each interaction - - Lower resource usage - - - **Persistent Mode (CHROME_PERSISTENT_SESSION=true):** - - Browser stays open between AI tasks - - Maintains history and state - - Allows viewing previous AI interactions - - Set in `.env` file or via environment variable when starting container - -4. **Viewing Browser Interactions:** - - Access the noVNC viewer at `http://localhost:6080/vnc.html` - - Enter the VNC password (default: "vncpassword" or what you set in VNC_PASSWORD) - - Direct VNC access available on port 5900 (mapped to container port 5901) - - You can now see all browser interactions in real-time - 5. **Container Management:** ```bash # Start with persistent browser diff --git a/docker-compose.yml b/docker-compose.yml index 75b0fd0..780d2a9 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,62 +1,76 @@ services: browser-use-webui: - platform: linux/amd64 build: context: . - dockerfile: ${DOCKERFILE:-Dockerfile} + dockerfile: Dockerfile args: TARGETPLATFORM: ${TARGETPLATFORM:-linux/amd64} ports: - - "7788:7788" # Gradio default port - - "6080:6080" # noVNC web interface - - "5901:5901" # VNC port - - "9222:9222" # Chrome remote debugging port + - "7788:7788" + - "6080:6080" + - "5901:5901" + - "9222:9222" environment: + # LLM API Keys & Endpoints (Your existing list) - OPENAI_ENDPOINT=${OPENAI_ENDPOINT:-https://api.openai.com/v1} - OPENAI_API_KEY=${OPENAI_API_KEY:-} - - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-} - ANTHROPIC_ENDPOINT=${ANTHROPIC_ENDPOINT:-https://api.anthropic.com} + - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-} - GOOGLE_API_KEY=${GOOGLE_API_KEY:-} - AZURE_OPENAI_ENDPOINT=${AZURE_OPENAI_ENDPOINT:-} - AZURE_OPENAI_API_KEY=${AZURE_OPENAI_API_KEY:-} + - AZURE_OPENAI_API_VERSION=${AZURE_OPENAI_API_VERSION:-2025-01-01-preview} - DEEPSEEK_ENDPOINT=${DEEPSEEK_ENDPOINT:-https://api.deepseek.com} - DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY:-} - OLLAMA_ENDPOINT=${OLLAMA_ENDPOINT:-http://localhost:11434} - - MISTRAL_API_KEY=${MISTRAL_API_KEY:-} - MISTRAL_ENDPOINT=${MISTRAL_ENDPOINT:-https://api.mistral.ai/v1} + - MISTRAL_API_KEY=${MISTRAL_API_KEY:-} - ALIBABA_ENDPOINT=${ALIBABA_ENDPOINT:-https://dashscope.aliyuncs.com/compatible-mode/v1} - ALIBABA_API_KEY=${ALIBABA_API_KEY:-} - MOONSHOT_ENDPOINT=${MOONSHOT_ENDPOINT:-https://api.moonshot.cn/v1} - MOONSHOT_API_KEY=${MOONSHOT_API_KEY:-} - - IBM_API_KEY=${IBM_API_KEY:-} + - UNBOUND_ENDPOINT=${UNBOUND_ENDPOINT:-https://api.getunbound.ai} + - UNBOUND_API_KEY=${UNBOUND_API_KEY:-} + - SiliconFLOW_ENDPOINT=${SiliconFLOW_ENDPOINT:-https://api.siliconflow.cn/v1/} + - SiliconFLOW_API_KEY=${SiliconFLOW_API_KEY:-} - IBM_ENDPOINT=${IBM_ENDPOINT:-https://us-south.ml.cloud.ibm.com} + - IBM_API_KEY=${IBM_API_KEY:-} - IBM_PROJECT_ID=${IBM_PROJECT_ID:-} - - BROWSER_USE_LOGGING_LEVEL=${BROWSER_USE_LOGGING_LEVEL:-info} + + # Application Settings - ANONYMIZED_TELEMETRY=${ANONYMIZED_TELEMETRY:-false} - - CHROME_PATH=/usr/bin/google-chrome - - CHROME_USER_DATA=/app/data/chrome_data - - CHROME_PERSISTENT_SESSION=${CHROME_PERSISTENT_SESSION:-false} - - CHROME_CDP=${CHROME_CDP:-http://localhost:9222} + - BROWSER_USE_LOGGING_LEVEL=${BROWSER_USE_LOGGING_LEVEL:-info} + + # Browser Settings + - BROWSER_USER_DATA=${BROWSER_USER_DATA:-/app/data/chrome_data} + - BROWSER_DEBUGGING_PORT=${BROWSER_DEBUGGING_PORT:-9222} + - BROWSER_DEBUGGING_HOST=${BROWSER_DEBUGGING_HOST:-0.0.0.0} + - KEEP_BROWSER_OPEN=${KEEP_BROWSER_OPEN:-true} + - BROWSER_CDP=${BROWSER_CDP:-} # e.g., http://localhost:9222 + + # Display Settings - DISPLAY=:99 - - PLAYWRIGHT_BROWSERS_PATH=/ms-playwright + # This ENV is used by the Dockerfile during build time if Patchright respects it. + # It's not strictly needed at runtime by docker-compose unless your app or scripts also read it. + - PLAYWRIGHT_BROWSERS_PATH=/ms-browsers # Matches Dockerfile ENV - RESOLUTION=${RESOLUTION:-1920x1080x24} - RESOLUTION_WIDTH=${RESOLUTION_WIDTH:-1920} - RESOLUTION_HEIGHT=${RESOLUTION_HEIGHT:-1080} - - VNC_PASSWORD=${VNC_PASSWORD:-vncpassword} - - CHROME_DEBUGGING_PORT=9222 - - CHROME_DEBUGGING_HOST=localhost + + # VNC Settings + - VNC_PASSWORD=${VNC_PASSWORD:-youvncpassword} + volumes: - /tmp/.X11-unix:/tmp/.X11-unix + # - ./my_chrome_data:/app/data/chrome_data # Optional: persist browser data restart: unless-stopped shm_size: '2gb' cap_add: - SYS_ADMIN - security_opt: - - seccomp=unconfined tmpfs: - /tmp healthcheck: - test: ["CMD", "nc", "-z", "localhost", "5901"] + test: ["CMD", "nc", "-z", "localhost", "5901"] # VNC port interval: 10s timeout: 5s - retries: 3 + retries: 3 \ No newline at end of file diff --git a/entrypoint.sh b/entrypoint.sh deleted file mode 100644 index 9ab9240..0000000 --- a/entrypoint.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -# Start supervisord in the foreground to properly manage child processes -exec /usr/bin/supervisord -n -c /etc/supervisor/conf.d/supervisord.conf \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 4762a7e..bc8de8c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -browser-use==0.1.43 +browser-use==0.1.45 pyperclip==1.9.0 gradio==5.27.0 json-repair diff --git a/src/agent/browser_use/browser_use_agent.py b/src/agent/browser_use/browser_use_agent.py index 49d671f..d5cba0f 100644 --- a/src/agent/browser_use/browser_use_agent.py +++ b/src/agent/browser_use/browser_use_agent.py @@ -20,6 +20,7 @@ from browser_use.telemetry.views import ( ) from browser_use.utils import time_execution_async from dotenv import load_dotenv +from browser_use.agent.message_manager.utils import is_model_without_tool_support load_dotenv() logger = logging.getLogger(__name__) @@ -30,6 +31,22 @@ SKIP_LLM_API_KEY_VERIFICATION = ( class BrowserUseAgent(Agent): + def _set_tool_calling_method(self) -> ToolCallingMethod | None: + tool_calling_method = self.settings.tool_calling_method + if tool_calling_method == 'auto': + if is_model_without_tool_support(self.model_name): + return 'raw' + elif self.chat_model_library == 'ChatGoogleGenerativeAI': + return None + elif self.chat_model_library == 'ChatOpenAI': + return 'function_calling' + elif self.chat_model_library == 'AzureChatOpenAI': + return 'function_calling' + else: + return None + else: + return tool_calling_method + @time_execution_async("--run (agent)") async def run( self, max_steps: int = 100, on_step_start: AgentHookFunc | None = None, diff --git a/src/agent/deep_research/deep_research_agent.py b/src/agent/deep_research/deep_research_agent.py index 8f95eec..b7a7a56 100644 --- a/src/agent/deep_research/deep_research_agent.py +++ b/src/agent/deep_research/deep_research_agent.py @@ -29,7 +29,7 @@ from langchain_core.tools import StructuredTool, Tool from langgraph.graph import StateGraph from pydantic import BaseModel, Field -from browser_use.browser.context import BrowserContextWindowSize, BrowserContextConfig +from browser_use.browser.context import BrowserContextConfig from src.agent.browser_use.browser_use_agent import BrowserUseAgent from src.browser.custom_browser import CustomBrowser @@ -82,22 +82,19 @@ async def run_single_browser_task( try: logger.info(f"Starting browser task for query: {task_query}") extra_args = [f"--window-size={window_w},{window_h}"] - if browser_user_data_dir: - extra_args.append(f"--user-data-dir={browser_user_data_dir}") if use_own_browser: - browser_binary_path = os.getenv("CHROME_PATH", None) or browser_binary_path + browser_binary_path = os.getenv("BROWSER_PATH", None) or browser_binary_path if browser_binary_path == "": browser_binary_path = None - chrome_user_data = os.getenv("CHROME_USER_DATA", None) - if chrome_user_data: - extra_args += [f"--user-data-dir={chrome_user_data}"] + browser_user_data = browser_user_data_dir or os.getenv("BROWSER_USER_DATA", None) + if browser_user_data: + extra_args += [f"--user-data-dir={browser_user_data}"] else: browser_binary_path = None bu_browser = CustomBrowser( config=BrowserConfig( headless=headless, - disable_security=False, browser_binary_path=browser_binary_path, extra_browser_args=extra_args, wss_url=wss_url, @@ -107,7 +104,8 @@ async def run_single_browser_task( context_config = BrowserContextConfig( save_downloads_path="./tmp/downloads", - browser_window_size=BrowserContextWindowSize(width=window_w, height=window_h), + window_height=window_h, + window_width=window_w, force_new_context=True, ) bu_browser_context = await bu_browser.new_context(config=context_config) @@ -299,30 +297,34 @@ Provide a list of distinct search queries(up to {max_parallel_browsers}) that ar # --- Langgraph State Definition --- -class ResearchPlanItem(TypedDict): - step: int - task: str +class ResearchTaskItem(TypedDict): + # step: int # Maybe step within category, or just implicit by order + task_description: str status: str # "pending", "completed", "failed" - queries: Optional[List[str]] # Queries generated for this task - result_summary: Optional[str] # Optional brief summary after execution + queries: Optional[List[str]] + result_summary: Optional[str] + + +class ResearchCategoryItem(TypedDict): + category_name: str + tasks: List[ResearchTaskItem] + # Optional: category_status: str # Could be "pending", "in_progress", "completed" class DeepResearchState(TypedDict): task_id: str topic: str - research_plan: List[ResearchPlanItem] - search_results: List[Dict[str, Any]] # Stores results from browser_search_tool_func - # messages: Sequence[BaseMessage] # History for ReAct-like steps within nodes - llm: Any # The LLM instance + research_plan: List[ResearchCategoryItem] # CHANGED + search_results: List[Dict[str, Any]] + llm: Any tools: List[Tool] output_dir: Path browser_config: Dict[str, Any] final_report: Optional[str] - current_step_index: int # To track progress through the plan - stop_requested: bool # Flag to signal termination - # Add other state variables as needed - error_message: Optional[str] # To store errors - + current_category_index: int + current_task_index_in_category: int + stop_requested: bool + error_message: Optional[str] messages: List[BaseMessage] @@ -330,44 +332,75 @@ class DeepResearchState(TypedDict): def _load_previous_state(task_id: str, output_dir: str) -> Dict[str, Any]: - """Loads state from files if they exist.""" state_updates = {} plan_file = os.path.join(output_dir, PLAN_FILENAME) search_file = os.path.join(output_dir, SEARCH_INFO_FILENAME) + + loaded_plan: List[ResearchCategoryItem] = [] + next_cat_idx, next_task_idx = 0, 0 + found_pending = False + if os.path.exists(plan_file): try: with open(plan_file, "r", encoding="utf-8") as f: - # Basic parsing, assumes markdown checklist format - plan = [] - step = 1 - for line in f: - line = line.strip() - if line.startswith(("- [x]", "- [ ]")): - status = "completed" if line.startswith("- [x]") else "pending" - task = line[5:].strip() - plan.append( - ResearchPlanItem( - step=step, - task=task, - status=status, - queries=None, - result_summary=None, - ) + current_category: Optional[ResearchCategoryItem] = None + lines = f.readlines() + cat_counter = 0 + task_counter_in_cat = 0 + + for line_num, line_content in enumerate(lines): + line = line_content.strip() + if line.startswith("## "): # Category + if current_category: # Save previous category + loaded_plan.append(current_category) + if not found_pending: # If previous category was all done, advance cat counter + cat_counter += 1 + task_counter_in_cat = 0 + category_name = line[line.find(" "):].strip() # Get text after "## X. " + current_category = ResearchCategoryItem(category_name=category_name, tasks=[]) + elif (line.startswith("- [ ]") or line.startswith("- [x]") or line.startswith( + "- [-]")) and current_category: # Task + status = "pending" + if line.startswith("- [x]"): + status = "completed" + elif line.startswith("- [-]"): + status = "failed" + + task_desc = line[5:].strip() + current_category["tasks"].append( + ResearchTaskItem(task_description=task_desc, status=status, queries=None, + result_summary=None) ) - step += 1 - state_updates["research_plan"] = plan - # Determine next step index based on loaded plan - next_step = next( - (i for i, item in enumerate(plan) if item["status"] == "pending"), - len(plan), - ) - state_updates["current_step_index"] = next_step + if status == "pending" and not found_pending: + next_cat_idx = cat_counter + next_task_idx = task_counter_in_cat + found_pending = True + if not found_pending: # only increment if previous tasks were completed/failed + task_counter_in_cat += 1 + + if current_category: # Append last category + loaded_plan.append(current_category) + + if loaded_plan: + state_updates["research_plan"] = loaded_plan + if not found_pending and loaded_plan: # All tasks were completed or failed + next_cat_idx = len(loaded_plan) # Points beyond the last category + next_task_idx = 0 + state_updates["current_category_index"] = next_cat_idx + state_updates["current_task_index_in_category"] = next_task_idx logger.info( - f"Loaded research plan from {plan_file}, next step index: {next_step}" + f"Loaded hierarchical research plan from {plan_file}. " + f"Next task: Category {next_cat_idx}, Task {next_task_idx} in category." ) + else: + logger.warning(f"Plan file {plan_file} was empty or malformed.") + except Exception as e: - logger.error(f"Failed to load or parse research plan {plan_file}: {e}") + logger.error(f"Failed to load or parse research plan {plan_file}: {e}", exc_info=True) state_updates["error_message"] = f"Failed to load research plan: {e}" + else: + logger.info(f"Plan file {plan_file} not found. Will start fresh.") + if os.path.exists(search_file): try: with open(search_file, "r", encoding="utf-8") as f: @@ -375,22 +408,25 @@ def _load_previous_state(task_id: str, output_dir: str) -> Dict[str, Any]: logger.info(f"Loaded search results from {search_file}") except Exception as e: logger.error(f"Failed to load search results {search_file}: {e}") - state_updates["error_message"] = f"Failed to load search results: {e}" - # Decide if this is fatal or if we can continue without old results + state_updates["error_message"] = ( + state_updates.get("error_message", "") + f" Failed to load search results: {e}").strip() return state_updates -def _save_plan_to_md(plan: List[ResearchPlanItem], output_dir: str): - """Saves the research plan to a markdown checklist file.""" +def _save_plan_to_md(plan: List[ResearchCategoryItem], output_dir: str): plan_file = os.path.join(output_dir, PLAN_FILENAME) try: with open(plan_file, "w", encoding="utf-8") as f: - f.write("# Research Plan\n\n") - for item in plan: - marker = "- [x]" if item["status"] == "completed" else "- [ ]" - f.write(f"{marker} {item['task']}\n") - logger.info(f"Research plan saved to {plan_file}") + f.write(f"# Research Plan\n\n") + for cat_idx, category in enumerate(plan): + f.write(f"## {cat_idx + 1}. {category['category_name']}\n\n") + for task_idx, task in enumerate(category['tasks']): + marker = "- [x]" if task["status"] == "completed" else "- [ ]" if task[ + "status"] == "pending" else "- [-]" # [-] for failed + f.write(f" {marker} {task['task_description']}\n") + f.write("\n") + logger.info(f"Hierarchical research plan saved to {plan_file}") except Exception as e: logger.error(f"Failed to save research plan to {plan_file}: {e}") @@ -419,7 +455,6 @@ def _save_report_to_md(report: str, output_dir: Path): async def planning_node(state: DeepResearchState) -> Dict[str, Any]: - """Generates the initial research plan or refines it if resuming.""" logger.info("--- Entering Planning Node ---") if state.get("stop_requested"): logger.info("Stop requested, skipping planning.") @@ -428,293 +463,344 @@ async def planning_node(state: DeepResearchState) -> Dict[str, Any]: llm = state["llm"] topic = state["topic"] existing_plan = state.get("research_plan") - existing_results = state.get("search_results") output_dir = state["output_dir"] - if existing_plan and state.get("current_step_index", 0) > 0: + if existing_plan and ( + state.get("current_category_index", 0) > 0 or state.get("current_task_index_in_category", 0) > 0): logger.info("Resuming with existing plan.") - # Maybe add logic here to let LLM review and potentially adjust the plan - # based on existing_results, but for now, we just use the loaded plan. _save_plan_to_md(existing_plan, output_dir) # Ensure it's saved initially - return {"research_plan": existing_plan} # Return the loaded plan + # current_category_index and current_task_index_in_category should be set by _load_previous_state + return {"research_plan": existing_plan} logger.info(f"Generating new research plan for topic: {topic}") - prompt = ChatPromptTemplate.from_messages( - [ - ( - "system", - """You are a meticulous research assistant. Your goal is to create a step-by-step research plan to thoroughly investigate a given topic. - The plan should consist of clear, actionable research tasks or questions. Each step should logically build towards a comprehensive understanding. - Format the output as a numbered list. Each item should represent a distinct research step or question. - Example: - 1. Define the core concepts and terminology related to [Topic]. - 2. Identify the key historical developments of [Topic]. - 3. Analyze the current state-of-the-art and recent advancements in [Topic]. - 4. Investigate the major challenges and limitations associated with [Topic]. - 5. Explore the future trends and potential applications of [Topic]. - 6. Summarize the findings and draw conclusions. + prompt_text = f"""You are a meticulous research assistant. Your goal is to create a hierarchical research plan to thoroughly investigate the topic: "{topic}". +The plan should be structured into several main research categories. Each category should contain a list of specific, actionable research tasks or questions. +Format the output as a JSON list of objects. Each object represents a research category and should have: +1. "category_name": A string for the name of the research category. +2. "tasks": A list of strings, where each string is a specific research task for that category. - Keep the plan focused and manageable. Aim for 5-10 detailed steps. - """, - ), - ("human", f"Generate a research plan for the topic: {topic}"), - ] - ) +Example JSON Output: +[ + {{ + "category_name": "Understanding Core Concepts and Definitions", + "tasks": [ + "Define the primary terminology associated with '{topic}'.", + "Identify the fundamental principles and theories underpinning '{topic}'." + ] + }}, + {{ + "category_name": "Historical Development and Key Milestones", + "tasks": [ + "Trace the historical evolution of '{topic}'.", + "Identify key figures, events, or breakthroughs in the development of '{topic}'." + ] + }}, + {{ + "category_name": "Current State-of-the-Art and Applications", + "tasks": [ + "Analyze the current advancements and prominent applications of '{topic}'.", + "Investigate ongoing research and active areas of development related to '{topic}'." + ] + }}, + {{ + "category_name": "Challenges, Limitations, and Future Outlook", + "tasks": [ + "Identify the major challenges and limitations currently facing '{topic}'.", + "Explore potential future trends, ethical considerations, and societal impacts of '{topic}'." + ] + }} +] + +Generate a plan with 3-10 categories, and 2-6 tasks per category for the topic: "{topic}" according to the complexity of the topic. +Ensure the output is a valid JSON array. +""" + messages = [ + SystemMessage(content="You are a research planning assistant outputting JSON."), + HumanMessage(content=prompt_text) + ] try: - response = await llm.ainvoke(prompt.format_prompt(topic=topic).to_messages()) - plan_text = response.content + response = await llm.ainvoke(messages) + raw_content = response.content + # The LLM might wrap the JSON in backticks + if raw_content.strip().startswith("```json"): + raw_content = raw_content.strip()[7:-3].strip() + elif raw_content.strip().startswith("```"): + raw_content = raw_content.strip()[3:-3].strip() - # Parse the numbered list into the plan structure - new_plan: List[ResearchPlanItem] = [] - for i, line in enumerate(plan_text.strip().split("\n")): - line = line.strip() - if line and (line[0].isdigit() or line.startswith(("*", "-"))): - # Simple parsing: remove number/bullet and space - task_text = ( - line.split(".", 1)[-1].strip() - if line[0].isdigit() - else line[1:].strip() - ) - if task_text: - new_plan.append( - ResearchPlanItem( - step=i + 1, - task=task_text, + logger.debug(f"LLM response for plan: {raw_content}") + parsed_plan_from_llm = json.loads(raw_content) + + new_plan: List[ResearchCategoryItem] = [] + for cat_idx, category_data in enumerate(parsed_plan_from_llm): + if not isinstance(category_data, + dict) or "category_name" not in category_data or "tasks" not in category_data: + logger.warning(f"Skipping invalid category data: {category_data}") + continue + + tasks: List[ResearchTaskItem] = [] + for task_idx, task_desc in enumerate(category_data["tasks"]): + if isinstance(task_desc, str): + tasks.append( + ResearchTaskItem( + task_description=task_desc, status="pending", queries=None, result_summary=None, ) ) + else: # Sometimes LLM puts tasks as {"task": "description"} + if isinstance(task_desc, dict) and "task_description" in task_desc: + tasks.append( + ResearchTaskItem( + task_description=task_desc["task_description"], + status="pending", + queries=None, + result_summary=None, + ) + ) + elif isinstance(task_desc, dict) and "task" in task_desc: # common LLM mistake + tasks.append( + ResearchTaskItem( + task_description=task_desc["task"], + status="pending", + queries=None, + result_summary=None, + ) + ) + else: + logger.warning( + f"Skipping invalid task data: {task_desc} in category {category_data['category_name']}") + + new_plan.append( + ResearchCategoryItem( + category_name=category_data["category_name"], + tasks=tasks, + ) + ) if not new_plan: - logger.error("LLM failed to generate a valid plan structure.") + logger.error("LLM failed to generate a valid plan structure from JSON.") return {"error_message": "Failed to generate research plan structure."} - logger.info(f"Generated research plan with {len(new_plan)} steps.") - _save_plan_to_md(new_plan, output_dir) + logger.info(f"Generated research plan with {len(new_plan)} categories.") + _save_plan_to_md(new_plan, output_dir) # Save the hierarchical plan return { "research_plan": new_plan, - "current_step_index": 0, # Start from the beginning - "search_results": [], # Initialize search results + "current_category_index": 0, + "current_task_index_in_category": 0, + "search_results": [], } + except json.JSONDecodeError as e: + logger.error(f"Failed to parse JSON from LLM for plan: {e}. Response was: {raw_content}", exc_info=True) + return {"error_message": f"LLM generated invalid JSON for research plan: {e}"} except Exception as e: logger.error(f"Error during planning: {e}", exc_info=True) return {"error_message": f"LLM Error during planning: {e}"} async def research_execution_node(state: DeepResearchState) -> Dict[str, Any]: - """ - Executes the next step in the research plan by invoking the LLM with tools. - The LLM decides which tool (e.g., browser search) to use and provides arguments. - """ logger.info("--- Entering Research Execution Node ---") if state.get("stop_requested"): logger.info("Stop requested, skipping research execution.") return { "stop_requested": True, - "current_step_index": state["current_step_index"], - } # Keep index same + "current_category_index": state["current_category_index"], + "current_task_index_in_category": state["current_task_index_in_category"], + } plan = state["research_plan"] - current_index = state["current_step_index"] + cat_idx = state["current_category_index"] + task_idx = state["current_task_index_in_category"] llm = state["llm"] - tools = state["tools"] # Tools are now passed in state + tools = state["tools"] output_dir = str(state["output_dir"]) - task_id = state["task_id"] - # Stop event is bound inside the tool function, no need to pass directly here + task_id = state["task_id"] # For _AGENT_STOP_FLAGS - if not plan or current_index >= len(plan): - logger.info("Research plan complete or empty.") - # This condition should ideally be caught by `should_continue` before reaching here - return {} + # This check should ideally be handled by `should_continue` + if not plan or cat_idx >= len(plan): + logger.info("Research plan complete or categories exhausted.") + return {} # should route to synthesis - current_step = plan[current_index] - if current_step["status"] == "completed": - logger.info(f"Step {current_step['step']} already completed, skipping.") - return {"current_step_index": current_index + 1} # Move to next step + current_category = plan[cat_idx] + if task_idx >= len(current_category["tasks"]): + logger.info(f"All tasks in category '{current_category['category_name']}' completed. Moving to next category.") + # This logic is now effectively handled by should_continue and the index updates below + # The next iteration will be caught by should_continue or this node with updated indices + return { + "current_category_index": cat_idx + 1, + "current_task_index_in_category": 0, + "messages": state["messages"] # Pass messages along + } + + current_task = current_category["tasks"][task_idx] + + if current_task["status"] == "completed": + logger.info( + f"Task '{current_task['task_description']}' in category '{current_category['category_name']}' already completed. Skipping.") + # Logic to find next task + next_task_idx = task_idx + 1 + next_cat_idx = cat_idx + if next_task_idx >= len(current_category["tasks"]): + next_cat_idx += 1 + next_task_idx = 0 + return { + "current_category_index": next_cat_idx, + "current_task_index_in_category": next_task_idx, + "messages": state["messages"] # Pass messages along + } logger.info( - f"Executing research step {current_step['step']}: {current_step['task']}" + f"Executing research task: '{current_task['task_description']}' (Category: '{current_category['category_name']}')" ) - # Bind tools to the LLM for this call llm_with_tools = llm.bind_tools(tools) - if state["messages"]: - current_task_message = [ - HumanMessage( - content=f"Research Task (Step {current_step['step']}): {current_step['task']}" - ) - ] - invocation_messages = state["messages"] + current_task_message + + # Construct messages for LLM invocation + task_prompt_content = ( + f"Current Research Category: {current_category['category_name']}\n" + f"Specific Task: {current_task['task_description']}\n\n" + "Please use the available tools, especially 'parallel_browser_search', to gather information for this specific task. " + "Provide focused search queries relevant ONLY to this task. " + "If you believe you have sufficient information from previous steps for this specific task, you can indicate that you are ready to summarize or that no further search is needed." + ) + current_task_message_history = [ + HumanMessage(content=task_prompt_content) + ] + if not state["messages"]: # First actual execution message + invocation_messages = [ + SystemMessage( + content="You are a research assistant executing one task of a research plan. Focus on the current task only."), + ] + current_task_message_history else: - current_task_message = [ - SystemMessage( - content="You are a research assistant executing one step of a research plan. Use the available tools, especially the 'parallel_browser_search' tool, to gather information needed for the current task. Be precise with your search queries if using the browser tool." - ), - HumanMessage( - content=f"Research Task (Step {current_step['step']}): {current_step['task']}" - ), - ] - invocation_messages = current_task_message + invocation_messages = state["messages"] + current_task_message_history try: - # Invoke the LLM, expecting it to make a tool call - logger.info(f"Invoking LLM with tools for task: {current_step['task']}") + logger.info(f"Invoking LLM with tools for task: {current_task['task_description']}") ai_response: BaseMessage = await llm_with_tools.ainvoke(invocation_messages) logger.info("LLM invocation complete.") tool_results = [] executed_tool_names = [] + current_search_results = state.get("search_results", []) # Get existing search results if not isinstance(ai_response, AIMessage) or not ai_response.tool_calls: - # LLM didn't call a tool. Maybe it answered directly? Or failed? logger.warning( - f"LLM did not call any tool for step {current_step['step']}. Response: {ai_response.content[:100]}..." - ) - # How to handle this? Mark step as failed? Or store the content? - # Let's mark as failed for now, assuming a tool was expected. - current_step["status"] = "failed" - current_step["result_summary"] = "LLM did not use a tool as expected." - _save_plan_to_md(plan, output_dir) - return { - "research_plan": plan, - "status": "pending", - "current_step_index": current_index, - "messages": [ - f"LLM failed to call a tool for step {current_step['step']}. Response: {ai_response.content}" - f". Please use tool to do research unless you are thinking or summary"], - } - - # Process tool calls - for tool_call in ai_response.tool_calls: - tool_name = tool_call.get("name") - tool_args = tool_call.get("args", {}) - tool_call_id = tool_call.get("id") # Important for ToolMessage - - logger.info(f"LLM requested tool call: {tool_name} with args: {tool_args}") - executed_tool_names.append(tool_name) - - # Find the corresponding tool instance - selected_tool = next((t for t in tools if t.name == tool_name), None) - - if not selected_tool: - logger.error(f"LLM called tool '{tool_name}' which is not available.") - # Create a ToolMessage indicating the error - tool_results.append( - ToolMessage( - content=f"Error: Tool '{tool_name}' not found.", - tool_call_id=tool_call_id, - ) - ) - continue # Skip to next tool call if any - - # Execute the tool - try: - # Stop check before executing the tool (tool itself also checks) - stop_event = _AGENT_STOP_FLAGS.get(task_id) - if stop_event and stop_event.is_set(): - logger.info(f"Stop requested before executing tool: {tool_name}") - current_step["status"] = "pending" # Not completed due to stop - _save_plan_to_md(plan, output_dir) - return {"stop_requested": True, "research_plan": plan} - - logger.info(f"Executing tool: {tool_name}") - # Assuming tool functions handle async correctly - tool_output = await selected_tool.ainvoke(tool_args) - logger.info(f"Tool '{tool_name}' executed successfully.") - browser_tool_called = "parallel_browser_search" in executed_tool_names - # Append result to overall search results - current_search_results = state.get("search_results", []) - if browser_tool_called: # Specific handling for browser tool output - current_search_results.extend(tool_output) - else: # Handle other tool outputs (e.g., file tools return strings) - # Store it associated with the step? Or a generic log? - # Let's just log it for now. Need better handling for diverse tool outputs. - logger.info( - f"Result from tool '{tool_name}': {str(tool_output)[:200]}..." - ) - - # Store result for potential next LLM call (if we were doing multi-turn) - tool_results.append( - ToolMessage( - content=json.dumps(tool_output), tool_call_id=tool_call_id - ) - ) - - except Exception as e: - logger.error(f"Error executing tool '{tool_name}': {e}", exc_info=True) - tool_results.append( - ToolMessage( - content=f"Error executing tool {tool_name}: {e}", - tool_call_id=tool_call_id, - ) - ) - # Also update overall state search_results with error? - current_search_results = state.get("search_results", []) - current_search_results.append( - { - "tool_name": tool_name, - "args": tool_args, - "status": "failed", - "error": str(e), - } - ) - - # Basic check: Did the browser tool run at all? (More specific checks needed) - browser_tool_called = "parallel_browser_search" in executed_tool_names - # We might need a more nuanced status based on the *content* of tool_results - step_failed = ( - any("Error:" in str(tr.content) for tr in tool_results) - or not browser_tool_called - ) - - if step_failed: - logger.warning( - f"Step {current_step['step']} failed or did not yield results via browser search." - ) - current_step["status"] = "failed" - current_step["result_summary"] = ( - f"Tool execution failed or browser tool not used. Errors: {[tr.content for tr in tool_results if 'Error' in str(tr.content)]}" + f"LLM did not call any tool for task '{current_task['task_description']}'. Response: {ai_response.content[:100]}..." ) + current_task["status"] = "pending" # Or "completed_no_tool" if LLM explains it's done + current_task["result_summary"] = f"LLM did not use a tool. Response: {ai_response.content}" + current_task["current_category_index"] = cat_idx + current_task["current_task_index_in_category"] = task_idx + return current_task + # We still save the plan and advance. else: - logger.info( - f"Step {current_step['step']} completed using tool(s): {executed_tool_names}." - ) - current_step["status"] = "completed" + # Process tool calls + for tool_call in ai_response.tool_calls: + tool_name = tool_call.get("name") + tool_args = tool_call.get("args", {}) + tool_call_id = tool_call.get("id") - current_step["result_summary"] = ( - f"Executed tool(s): {', '.join(executed_tool_names)}." - ) + logger.info(f"LLM requested tool call: {tool_name} with args: {tool_args}") + executed_tool_names.append(tool_name) + selected_tool = next((t for t in tools if t.name == tool_name), None) + if not selected_tool: + logger.error(f"LLM called tool '{tool_name}' which is not available.") + tool_results.append( + ToolMessage(content=f"Error: Tool '{tool_name}' not found.", tool_call_id=tool_call_id)) + continue + + try: + stop_event = _AGENT_STOP_FLAGS.get(task_id) + if stop_event and stop_event.is_set(): + logger.info(f"Stop requested before executing tool: {tool_name}") + current_task["status"] = "pending" # Or a new "stopped" status + _save_plan_to_md(plan, output_dir) + return {"stop_requested": True, "research_plan": plan, "current_category_index": cat_idx, + "current_task_index_in_category": task_idx} + + logger.info(f"Executing tool: {tool_name}") + tool_output = await selected_tool.ainvoke(tool_args) + logger.info(f"Tool '{tool_name}' executed successfully.") + + if tool_name == "parallel_browser_search": + current_search_results.extend(tool_output) # tool_output is List[Dict] + else: # For other tools, we might need specific handling or just log + logger.info(f"Result from tool '{tool_name}': {str(tool_output)[:200]}...") + # Storing non-browser results might need a different structure or key in search_results + current_search_results.append( + {"tool_name": tool_name, "args": tool_args, "output": str(tool_output), + "status": "completed"}) + + tool_results.append(ToolMessage(content=json.dumps(tool_output), tool_call_id=tool_call_id)) + + except Exception as e: + logger.error(f"Error executing tool '{tool_name}': {e}", exc_info=True) + tool_results.append( + ToolMessage(content=f"Error executing tool {tool_name}: {e}", tool_call_id=tool_call_id)) + current_search_results.append( + {"tool_name": tool_name, "args": tool_args, "status": "failed", "error": str(e)}) + + # After processing all tool calls for this task + step_failed_tool_execution = any("Error:" in str(tr.content) for tr in tool_results) + # Consider a task successful if a browser search was attempted and didn't immediately error out during call + # The browser search itself returns status for each query. + browser_tool_attempted_successfully = "parallel_browser_search" in executed_tool_names and not step_failed_tool_execution + + if step_failed_tool_execution: + current_task["status"] = "failed" + current_task[ + "result_summary"] = f"Tool execution failed. Errors: {[tr.content for tr in tool_results if 'Error' in str(tr.content)]}" + elif executed_tool_names: # If any tool was called + current_task["status"] = "completed" + current_task["result_summary"] = f"Executed tool(s): {', '.join(executed_tool_names)}." + # TODO: Could ask LLM to summarize the tool_results for this task if needed, rather than just listing tools. + else: # No tool calls but AI response had .tool_calls structure (empty) + current_task["status"] = "failed" # Or a more specific status + current_task["result_summary"] = "LLM prepared for tool call but provided no tools." + + # Save progress _save_plan_to_md(plan, output_dir) _save_search_results_to_json(current_search_results, output_dir) + # Determine next indices + next_task_idx = task_idx + 1 + next_cat_idx = cat_idx + if next_task_idx >= len(current_category["tasks"]): + next_cat_idx += 1 + next_task_idx = 0 + + updated_messages = state["messages"] + current_task_message_history + [ai_response] + tool_results + return { "research_plan": plan, - "search_results": current_search_results, # Update with new results - "current_step_index": current_index + 1, - "messages": state["messages"] - + current_task_message - + [ai_response] - + tool_results, - # Optionally return the tool_results messages if needed by downstream nodes + "search_results": current_search_results, + "current_category_index": next_cat_idx, + "current_task_index_in_category": next_task_idx, + "messages": updated_messages, } except Exception as e: - logger.error( - f"Unhandled error during research execution node for step {current_step['step']}: {e}", - exc_info=True, - ) - current_step["status"] = "failed" + logger.error(f"Unhandled error during research execution for task '{current_task['task_description']}': {e}", + exc_info=True) + current_task["status"] = "failed" _save_plan_to_md(plan, output_dir) + # Determine next indices even on error to attempt to move on + next_task_idx = task_idx + 1 + next_cat_idx = cat_idx + if next_task_idx >= len(current_category["tasks"]): + next_cat_idx += 1 + next_task_idx = 0 return { "research_plan": plan, - "current_step_index": current_index + 1, # Move on even if error? - "error_message": f"Core Execution Error on step {current_step['step']}: {e}", + "current_category_index": next_cat_idx, + "current_task_index_in_category": next_task_idx, + "error_message": f"Core Execution Error on task '{current_task['task_description']}': {e}", + "messages": state["messages"] + current_task_message_history # Preserve messages up to error } @@ -747,36 +833,37 @@ async def synthesis_node(state: DeepResearchState) -> Dict[str, Any]: references = {} ref_count = 1 for i, result_entry in enumerate(search_results): - query = result_entry.get("query", "Unknown Query") + query = result_entry.get("query", "Unknown Query") # From parallel_browser_search + tool_name = result_entry.get("tool_name") # From other tools status = result_entry.get("status", "unknown") - result_data = result_entry.get( - "result" - ) # This should be the dict with summary, title, url - error = result_entry.get("error") + result_data = result_entry.get("result") # From BrowserUseAgent's final_result + tool_output_str = result_entry.get("output") # From other tools - if status == "completed" and result_data: - summary = result_data - formatted_results += f'### Finding from Query: "{query}"\n' - formatted_results += f"- **Summary:**\n{summary}\n" + if tool_name == "parallel_browser_search" and status == "completed" and result_data: + # result_data is the summary from BrowserUseAgent + formatted_results += f'### Finding from Web Search Query: "{query}"\n' + formatted_results += f"- **Summary:**\n{result_data}\n" # result_data is already a summary string here + # If result_data contained title/URL, you'd format them here. + # The current BrowserUseAgent returns a string summary directly as 'final_data' in run_single_browser_task + formatted_results += "---\n" + elif tool_name != "parallel_browser_search" and status == "completed" and tool_output_str: + formatted_results += f'### Finding from Tool: "{tool_name}" (Args: {result_entry.get("args")})\n' + formatted_results += f"- **Output:**\n{tool_output_str}\n" formatted_results += "---\n" - elif status == "failed": - formatted_results += f'### Failed Query: "{query}"\n' + error = result_entry.get("error") + q_or_t = f"Query: \"{query}\"" if query != "Unknown Query" else f"Tool: \"{tool_name}\"" + formatted_results += f'### Failed {q_or_t}\n' formatted_results += f"- **Error:** {error}\n" formatted_results += "---\n" - # Ignore cancelled/other statuses for the report content # Prepare the research plan context plan_summary = "\nResearch Plan Followed:\n" - for item in plan: - marker = ( - "- [x]" - if item["status"] == "completed" - else "- [ ] (Failed)" - if item["status"] == "failed" - else "- [ ]" - ) - plan_summary += f"{marker} {item['task']}\n" + for cat_idx, category in enumerate(plan): + plan_summary += f"\n#### Category {cat_idx + 1}: {category['category_name']}\n" + for task_idx, task in enumerate(category['tasks']): + marker = "[x]" if task["status"] == "completed" else "[ ]" if task["status"] == "pending" else "[-]" + plan_summary += f" - {marker} {task['task_description']}\n" synthesis_prompt = ChatPromptTemplate.from_messages( [ @@ -785,29 +872,28 @@ async def synthesis_node(state: DeepResearchState) -> Dict[str, Any]: """You are a professional researcher tasked with writing a comprehensive and well-structured report based on collected findings. The report should address the research topic thoroughly, synthesizing the information gathered from various sources. Structure the report logically: - 1. **Introduction:** Briefly introduce the topic and the report's scope (mentioning the research plan followed is good). - 2. **Main Body:** Discuss the key findings, organizing them thematically or according to the research plan steps. Analyze, compare, and contrast information from different sources where applicable. **Crucially, cite your sources using bracketed numbers [X] corresponding to the reference list.** - 3. **Conclusion:** Summarize the main points and offer concluding thoughts or potential areas for further research. + 1. Briefly introduce the topic and the report's scope (mentioning the research plan followed, including categories and tasks, is good). + 2. Discuss the key findings, organizing them thematically, possibly aligning with the research categories. Analyze, compare, and contrast information. + 3. Summarize the main points and offer concluding thoughts. - Ensure the tone is objective, professional, and analytical. Base the report **strictly** on the provided findings. Do not add external knowledge. If findings are contradictory or incomplete, acknowledge this. - """, + Ensure the tone is objective and professional. + If findings are contradictory or incomplete, acknowledge this. + """, # Removed citation part for simplicity for now, as browser agent returns summaries. ), ( "human", f""" - **Research Topic:** {topic} + **Research Topic:** {topic} - {plan_summary} + {plan_summary} - **Collected Findings:** - ``` - {formatted_results} - ``` + **Collected Findings:** + ``` + {formatted_results} + ``` - ``` - - Please generate the final research report in Markdown format based **only** on the information above. Ensure all claims derived from the findings are properly cited using the format [Reference_ID]. - """, + Please generate the final research report in Markdown format based **only** on the information above. + """, ), ] ) @@ -818,7 +904,6 @@ async def synthesis_node(state: DeepResearchState) -> Dict[str, Any]: topic=topic, plan_summary=plan_summary, formatted_results=formatted_results, - references=references, ).to_messages() ) final_report_md = response.content @@ -847,34 +932,44 @@ async def synthesis_node(state: DeepResearchState) -> Dict[str, Any]: def should_continue(state: DeepResearchState) -> str: - """Determines the next step based on the current state.""" logger.info("--- Evaluating Condition: Should Continue? ---") if state.get("stop_requested"): logger.info("Stop requested, routing to END.") - return "end_run" # Go to a dedicated end node for cleanup if needed - if state.get("error_message"): - logger.warning(f"Error detected: {state['error_message']}. Routing to END.") - # Decide if errors should halt execution or if it should try to synthesize anyway - return "end_run" # Stop on error for now + return "end_run" + if state.get("error_message") and "Core Execution Error" in state["error_message"]: # Critical error in node + logger.warning(f"Critical error detected: {state['error_message']}. Routing to END.") + return "end_run" plan = state.get("research_plan") - current_index = state.get("current_step_index", 0) + cat_idx = state.get("current_category_index", 0) + task_idx = state.get("current_task_index_in_category", 0) # This is the *next* task to check if not plan: - logger.warning( - "No research plan found, cannot continue execution. Routing to END." - ) - return "end_run" # Should not happen if planning node ran correctly + logger.warning("No research plan found. Routing to END.") + return "end_run" - # Check if there are pending steps in the plan - if current_index < len(plan): - logger.info( - f"Plan has pending steps (current index {current_index}/{len(plan)}). Routing to Research Execution." - ) - return "execute_research" - else: - logger.info("All plan steps processed. Routing to Synthesis.") - return "synthesize_report" + # Check if the current indices point to a valid pending task + if cat_idx < len(plan): + current_category = plan[cat_idx] + if task_idx < len(current_category["tasks"]): + # We are trying to execute the task at plan[cat_idx]["tasks"][task_idx] + # The research_execution_node will handle if it's already completed. + logger.info( + f"Plan has potential pending tasks (next up: Category {cat_idx}, Task {task_idx}). Routing to Research Execution." + ) + return "execute_research" + else: # task_idx is out of bounds for current category, means we need to check next category + if cat_idx + 1 < len(plan): # If there is a next category + logger.info( + f"Finished tasks in category {cat_idx}. Moving to category {cat_idx + 1}. Routing to Research Execution." + ) + # research_execution_node will update state to {current_category_index: cat_idx + 1, current_task_index_in_category: 0} + # Or rather, the previous execution node already set these indices to the start of the next category. + return "execute_research" + + # If we've gone through all categories and tasks (cat_idx >= len(plan)) + logger.info("All plan categories and tasks processed or current indices are out of bounds. Routing to Synthesis.") + return "synthesize_report" # --- DeepSearchAgent Class --- @@ -1033,22 +1128,24 @@ class DeepResearchAgent: "messages": [], "llm": self.llm, "tools": agent_tools, - "output_dir": output_dir, + "output_dir": Path(output_dir), "browser_config": self.browser_config, "final_report": None, - "current_step_index": 0, + "current_category_index": 0, + "current_task_index_in_category": 0, "stop_requested": False, "error_message": None, } - loaded_state = {} if task_id: logger.info(f"Attempting to resume task {task_id}...") loaded_state = _load_previous_state(task_id, output_dir) initial_state.update(loaded_state) if loaded_state.get("research_plan"): logger.info( - f"Resuming with {len(loaded_state['research_plan'])} plan steps and {len(loaded_state.get('search_results', []))} existing results." + f"Resuming with {len(loaded_state['research_plan'])} plan categories " + f"and {len(loaded_state.get('search_results', []))} existing results. " + f"Next task: Cat {initial_state['current_category_index']}, Task {initial_state['current_task_index_in_category']}" ) initial_state["topic"] = ( topic # Allow overriding topic even when resuming? Or use stored topic? Let's use new one. @@ -1057,7 +1154,6 @@ class DeepResearchAgent: logger.warning( f"Resume requested for {task_id}, but no previous plan found. Starting fresh." ) - initial_state["current_step_index"] = 0 # --- Execute Graph using ainvoke --- final_state = None diff --git a/src/webui/components/browser_settings_tab.py b/src/webui/components/browser_settings_tab.py index 40c104c..e502d9e 100644 --- a/src/webui/components/browser_settings_tab.py +++ b/src/webui/components/browser_settings_tab.py @@ -1,3 +1,5 @@ +import os + import gradio as gr import logging from gradio.components import Component @@ -56,7 +58,7 @@ def create_browser_settings_tab(webui_manager: WebuiManager): ) keep_browser_open = gr.Checkbox( label="Keep Browser Open", - value=True, + value=os.getenv("KEEP_BROWSER_OPEN", True), info="Keep Browser Open between Tasks", interactive=True ) @@ -91,6 +93,7 @@ def create_browser_settings_tab(webui_manager: WebuiManager): with gr.Row(): cdp_url = gr.Textbox( label="CDP URL", + value=os.getenv("BROWSER_CDP", None), info="CDP URL for browser remote debugging", interactive=True, ) diff --git a/src/webui/components/browser_use_agent_tab.py b/src/webui/components/browser_use_agent_tab.py index b3c00a0..1a292dd 100644 --- a/src/webui/components/browser_use_agent_tab.py +++ b/src/webui/components/browser_use_agent_tab.py @@ -13,7 +13,7 @@ from browser_use.agent.views import ( AgentOutput, ) from browser_use.browser.browser import BrowserConfig -from browser_use.browser.context import BrowserContext, BrowserContextWindowSize, BrowserContextConfig +from browser_use.browser.context import BrowserContext, BrowserContextConfig from browser_use.browser.views import BrowserState from gradio.components import Component from langchain_core.language_models.chat_models import BaseChatModel @@ -451,20 +451,16 @@ async def run_agent_task( if not webui_manager.bu_browser: logger.info("Launching new browser instance.") extra_args = [f"--window-size={window_w},{window_h}"] - if browser_user_data_dir: - extra_args.append(f"--user-data-dir={browser_user_data_dir}") - if use_own_browser: - browser_binary_path = ( - os.getenv("CHROME_PATH", None) or browser_binary_path - ) + browser_binary_path = os.getenv("BROWSER_PATH", None) or browser_binary_path if browser_binary_path == "": browser_binary_path = None - chrome_user_data = os.getenv("CHROME_USER_DATA", None) - if chrome_user_data: - extra_args += [f"--user-data-dir={chrome_user_data}"] + browser_user_data = browser_user_data_dir or os.getenv("BROWSER_USER_DATA", None) + if browser_user_data: + extra_args += [f"--user-data-dir={browser_user_data}"] else: browser_binary_path = None + webui_manager.bu_browser = CustomBrowser( config=BrowserConfig( headless=headless, @@ -485,7 +481,8 @@ async def run_agent_task( if save_recording_path else None, save_downloads_path=save_download_path if save_download_path else None, - browser_window_size=BrowserContextWindowSize(width=window_w, height=window_h), + window_height=window_h, + window_width=window_w, ) if not webui_manager.bu_browser: raise ValueError("Browser not initialized, cannot create context.") diff --git a/supervisord.conf b/supervisord.conf index 3410b91..2c5d8b4 100644 --- a/supervisord.conf +++ b/supervisord.conf @@ -66,8 +66,8 @@ startsecs=3 depends_on=x11vnc [program:persistent_browser] -environment=START_URL="data:text/html,

Browser Ready

" -command=bash -c "mkdir -p /app/data/chrome_data && sleep 8 && $(find /ms-playwright/chromium-*/chrome-linux -name chrome) --user-data-dir=/app/data/chrome_data --window-position=0,0 --window-size=%(ENV_RESOLUTION_WIDTH)s,%(ENV_RESOLUTION_HEIGHT)s --start-maximized --no-sandbox --disable-dev-shm-usage --disable-gpu --disable-software-rasterizer --disable-setuid-sandbox --no-first-run --no-default-browser-check --no-experiments --ignore-certificate-errors --remote-debugging-port=9222 --remote-debugging-address=0.0.0.0 \"$START_URL\"" +environment=START_URL="data:text/html,

Browser Ready

",BROWSER_USER_DATA="/app/data/chrome_data",BROWSER_DEBUGGING_PORT="%(ENV_BROWSER_DEBUGGING_PORT)s",BROWSER_DEBUGGING_HOST="%(ENV_BROWSER_DEBUGGING_HOST)s" +command=bash -c "mkdir -p %(ENV_BROWSER_USER_DATA)s && sleep 8 && $(find $PLAYWRIGHT_BROWSERS_PATH/chrome-*/chrome-linux -name chrome || find /root/.cache/ms-playwright/chrome-*/chrome-linux -name chrome || find /opt/google/chrome -name chrome || echo \"/usr/bin/google-chrome-stable\") --user-data-dir=%(ENV_BROWSER_USER_DATA)s --window-position=0,0 --window-size=%(ENV_RESOLUTION_WIDTH)s,%(ENV_RESOLUTION_HEIGHT)s --start-maximized --no-sandbox --disable-dev-shm-usage --disable-gpu --disable-software-rasterizer --disable-setuid-sandbox --no-first-run --no-default-browser-check --no-experiments --ignore-certificate-errors --remote-debugging-port=%(ENV_BROWSER_DEBUGGING_PORT)s --remote-debugging-address=%(ENV_BROWSER_DEBUGGING_HOST)s --enable-features=NetworkService,NetworkServiceInProcess --disable-features=ImprovedCookieControls \"$START_URL\"" autorestart=true stdout_logfile=/dev/stdout stdout_logfile_maxbytes=0 @@ -93,4 +93,4 @@ startretries=3 startsecs=3 stopsignal=TERM stopwaitsecs=10 -depends_on=persistent_browser +depends_on=persistent_browser \ No newline at end of file diff --git a/tests/test_agents.py b/tests/test_agents.py index d485c70..1285167 100644 --- a/tests/test_agents.py +++ b/tests/test_agents.py @@ -20,8 +20,7 @@ from src.utils import utils async def test_browser_use_agent(): from browser_use.browser.browser import Browser, BrowserConfig from browser_use.browser.context import ( - BrowserContextConfig, - BrowserContextWindowSize, + BrowserContextConfig ) from browser_use.agent.service import Agent @@ -38,12 +37,12 @@ async def test_browser_use_agent(): # api_key=os.getenv("OPENAI_API_KEY", ""), # ) - # llm = utils.get_llm_model( - # provider="google", - # model_name="gemini-2.0-flash", - # temperature=0.6, - # api_key=os.getenv("GOOGLE_API_KEY", "") - # ) + llm = llm_provider.get_llm_model( + provider="google", + model_name="gemini-2.0-flash", + temperature=0.6, + api_key=os.getenv("GOOGLE_API_KEY", "") + ) # llm = utils.get_llm_model( # provider="deepseek", @@ -67,13 +66,13 @@ async def test_browser_use_agent(): window_w, window_h = 1280, 1100 - llm = llm_provider.get_llm_model( - provider="azure_openai", - model_name="gpt-4o", - temperature=0.5, - base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""), - api_key=os.getenv("AZURE_OPENAI_API_KEY", ""), - ) + # llm = llm_provider.get_llm_model( + # provider="azure_openai", + # model_name="gpt-4o", + # temperature=0.5, + # base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""), + # api_key=os.getenv("AZURE_OPENAI_API_KEY", ""), + # ) mcp_server_config = { "mcpServers": { @@ -98,7 +97,6 @@ async def test_browser_use_agent(): controller = CustomController() await controller.setup_mcp_client(mcp_server_config) use_own_browser = True - disable_security = False use_vision = True # Set to False when using DeepSeek max_actions_per_step = 10 @@ -106,33 +104,30 @@ async def test_browser_use_agent(): browser_context = None try: - extra_chromium_args = [f"--window-size={window_w},{window_h}"] + extra_browser_args = [f"--window-size={window_w},{window_h}"] if use_own_browser: - chrome_path = os.getenv("CHROME_PATH", None) - if chrome_path == "": - chrome_path = None - chrome_user_data = os.getenv("CHROME_USER_DATA", None) - if chrome_user_data: - extra_chromium_args += [f"--user-data-dir={chrome_user_data}"] + browser_binary_path = os.getenv("BROWSER_PATH", None) + if browser_binary_path == "": + browser_binary_path = None + browser_user_data = os.getenv("BROWSER_USER_DATA", None) + if browser_user_data: + extra_browser_args += [f"--user-data-dir={browser_user_data}"] else: - chrome_path = None + browser_binary_path = None browser = CustomBrowser( config=BrowserConfig( headless=False, - disable_security=disable_security, - browser_binary_path=chrome_path, - extra_browser_args=extra_chromium_args, + browser_binary_path=browser_binary_path, + extra_browser_args=extra_browser_args, ) ) browser_context = await browser.new_context( config=BrowserContextConfig( - trace_path="./tmp/traces", - save_recording_path="./tmp/record_videos", + trace_path=None, + save_recording_path=None, save_downloads_path="./tmp/downloads", - browser_window_size=BrowserContextWindowSize( - width=window_w, height=window_h - ), - force_new_context=True + window_height=window_h, + window_width=window_w, ) ) agent = BrowserUseAgent( @@ -167,17 +162,9 @@ async def test_browser_use_agent(): async def test_browser_use_parallel(): - from browser_use.browser.context import BrowserContextWindowSize - from browser_use.browser.browser import BrowserConfig - from patchright.async_api import async_playwright - from browser_use.browser.browser import Browser - from src.browser.custom_context import BrowserContextConfig - from src.controller.custom_controller import CustomController - from browser_use.browser.browser import Browser, BrowserConfig from browser_use.browser.context import ( BrowserContextConfig, - BrowserContextWindowSize, ) from browser_use.agent.service import Agent @@ -261,8 +248,7 @@ async def test_browser_use_parallel(): } controller = CustomController() await controller.setup_mcp_client(mcp_server_config) - use_own_browser = False - disable_security = False + use_own_browser = True use_vision = True # Set to False when using DeepSeek max_actions_per_step = 10 @@ -270,32 +256,30 @@ async def test_browser_use_parallel(): browser_context = None try: - extra_chromium_args = [f"--window-size={window_w},{window_h}"] + extra_browser_args = [f"--window-size={window_w},{window_h}"] if use_own_browser: - chrome_path = os.getenv("CHROME_PATH", None) - if chrome_path == "": - chrome_path = None - chrome_user_data = os.getenv("CHROME_USER_DATA", None) - if chrome_user_data: - extra_chromium_args += [f"--user-data-dir={chrome_user_data}"] + browser_binary_path = os.getenv("BROWSER_PATH", None) + if browser_binary_path == "": + browser_binary_path = None + browser_user_data = os.getenv("BROWSER_USER_DATA", None) + if browser_user_data: + extra_browser_args += [f"--user-data-dir={browser_user_data}"] else: - chrome_path = None + browser_binary_path = None browser = CustomBrowser( config=BrowserConfig( headless=False, - disable_security=disable_security, - browser_binary_path=chrome_path, - extra_browser_args=extra_chromium_args, + browser_binary_path=browser_binary_path, + extra_browser_args=extra_browser_args, ) ) browser_context = await browser.new_context( config=BrowserContextConfig( - trace_path="./tmp/traces", - save_recording_path="./tmp/record_videos", + trace_path=None, + save_recording_path=None, save_downloads_path="./tmp/downloads", - browser_window_size=BrowserContextWindowSize( - width=window_w, height=window_h - ), + window_height=window_h, + window_width=window_w, force_new_context=True ) ) @@ -364,7 +348,7 @@ async def test_deep_research_agent(): browser_config = {"headless": False, "window_width": 1280, "window_height": 1100, "use_own_browser": False} agent = DeepResearchAgent(llm=llm, browser_config=browser_config, mcp_server_config=mcp_server_config) - research_topic = "Give me a detailed travel plan to Switzerland from June 1st to 10th." + research_topic = "Give me investment advices of nvidia and tesla." task_id_to_resume = "" # Set this to resume a previous task ID print(f"Starting research on: {research_topic}") @@ -405,6 +389,6 @@ async def test_deep_research_agent(): if __name__ == "__main__": - # asyncio.run(test_browser_use_agent()) + asyncio.run(test_browser_use_agent()) # asyncio.run(test_browser_use_parallel()) - asyncio.run(test_deep_research_agent()) + # asyncio.run(test_deep_research_agent())