add browser-use agent run

This commit is contained in:
vincent
2025-04-28 22:11:56 +08:00
parent 0d259efbeb
commit 4c87694cef
18 changed files with 1343 additions and 523 deletions

View File

@@ -17,98 +17,18 @@ from browser_use.agent.views import AgentHistoryList
from src.utils import utils
async def test_browser_use_org():
async def test_browser_use_agent():
from browser_use.browser.browser import Browser, BrowserConfig
from browser_use.browser.context import (
BrowserContextConfig,
BrowserContextWindowSize,
)
from browser_use.agent.service import Agent
# llm = utils.get_llm_model(
# provider="azure_openai",
# model_name="gpt-4o",
# temperature=0.8,
# base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
# api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
# )
# llm = utils.get_llm_model(
# provider="deepseek",
# model_name="deepseek-chat",
# temperature=0.8
# )
llm = utils.get_llm_model(
provider="ollama", model_name="deepseek-r1:14b", temperature=0.5
)
window_w, window_h = 1920, 1080
use_vision = False
use_own_browser = False
if use_own_browser:
chrome_path = os.getenv("CHROME_PATH", None)
if chrome_path == "":
chrome_path = None
else:
chrome_path = None
tool_calling_method = "json_schema" # setting to json_schema when using ollama
browser = Browser(
config=BrowserConfig(
headless=False,
disable_security=True,
chrome_instance_path=chrome_path,
extra_chromium_args=[f"--window-size={window_w},{window_h}"],
)
)
async with await browser.new_context(
config=BrowserContextConfig(
trace_path="./tmp/traces",
save_recording_path="./tmp/record_videos",
no_viewport=False,
browser_window_size=BrowserContextWindowSize(
width=window_w, height=window_h
),
)
) as browser_context:
agent = Agent(
task="go to google.com and type 'OpenAI' click search and give me the first url",
llm=llm,
browser_context=browser_context,
use_vision=use_vision,
tool_calling_method=tool_calling_method
)
history: AgentHistoryList = await agent.run(max_steps=10)
print("Final Result:")
pprint(history.final_result(), indent=4)
print("\nErrors:")
pprint(history.errors(), indent=4)
# e.g. xPaths the model clicked on
print("\nModel Outputs:")
pprint(history.model_actions(), indent=4)
print("\nThoughts:")
pprint(history.model_thoughts(), indent=4)
# close browser
await browser.close()
async def test_browser_use_custom():
from browser_use.browser.context import BrowserContextWindowSize
from browser_use.browser.browser import BrowserConfig
from playwright.async_api import async_playwright
from src.agent.custom_agent import CustomAgent
from src.agent.custom_prompts import CustomSystemPrompt, CustomAgentMessagePrompt
from src.browser.custom_browser import CustomBrowser
from src.browser.custom_context import BrowserContextConfig
from src.browser.custom_context import CustomBrowserContextConfig
from src.controller.custom_controller import CustomController
window_w, window_h = 1280, 1100
from src.utils import llm_provider
# llm = utils.get_llm_model(
# provider="openai",
@@ -118,14 +38,6 @@ async def test_browser_use_custom():
# api_key=os.getenv("OPENAI_API_KEY", ""),
# )
llm = utils.get_llm_model(
provider="azure_openai",
model_name="gpt-4o",
temperature=0.5,
base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
)
# llm = utils.get_llm_model(
# provider="google",
# model_name="gemini-2.0-flash",
@@ -153,13 +65,43 @@ async def test_browser_use_custom():
# provider="ollama", model_name="deepseek-r1:14b", temperature=0.5
# )
window_w, window_h = 1280, 1100
llm = llm_provider.get_llm_model(
provider="azure_openai",
model_name="gpt-4o",
temperature=0.5,
base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
)
mcp_server_config = {
"mcpServers": {
"markitdown": {
"command": "docker",
"args": [
"run",
"--rm",
"-i",
"markitdown-mcp:latest"
]
},
"desktop-commander": {
"command": "npx",
"args": [
"-y",
"@wonderwhy-er/desktop-commander"
]
},
}
}
controller = CustomController()
use_own_browser = True
await controller.setup_mcp_client(mcp_server_config)
use_own_browser = False
disable_security = True
use_vision = True # Set to False when using DeepSeek
max_actions_per_step = 10
playwright = None
browser = None
browser_context = None
@@ -178,29 +120,27 @@ async def test_browser_use_custom():
config=BrowserConfig(
headless=False,
disable_security=disable_security,
chrome_instance_path=chrome_path,
extra_chromium_args=extra_chromium_args,
browser_binary_path=chrome_path,
extra_browser_args=extra_chromium_args,
)
)
browser_context = await browser.new_context(
config=BrowserContextConfig(
config=CustomBrowserContextConfig(
trace_path="./tmp/traces",
save_recording_path="./tmp/record_videos",
no_viewport=False,
save_downloads_path="./tmp/downloads",
browser_window_size=BrowserContextWindowSize(
width=window_w, height=window_h
),
force_new_context=True
)
)
agent = CustomAgent(
task="open youtube in tab 1 , open google email in tab 2, open facebook in tab 3",
add_infos="", # some hints for llm to complete the task
agent = Agent(
task="download pdf from https://arxiv.org/abs/2504.10458 and rename this pdf to 'GUI-r1-test.pdf'",
llm=llm,
browser=browser,
browser_context=browser_context,
controller=controller,
system_prompt_class=CustomSystemPrompt,
agent_prompt_class=CustomAgentMessagePrompt,
use_vision=use_vision,
max_actions_per_step=max_actions_per_step,
generate_gif=True
@@ -213,28 +153,17 @@ async def test_browser_use_custom():
print("\nErrors:")
pprint(history.errors(), indent=4)
# e.g. xPaths the model clicked on
print("\nModel Outputs:")
pprint(history.model_actions(), indent=4)
print("\nThoughts:")
pprint(history.model_thoughts(), indent=4)
except Exception:
import traceback
traceback.print_exc()
finally:
# 显式关闭持久化上下文
if browser_context:
await browser_context.close()
# 关闭 Playwright 对象
if playwright:
await playwright.stop()
if browser:
await browser.close()
if controller:
await controller.close_mcp_client()
async def test_browser_use_parallel():
@@ -242,13 +171,20 @@ async def test_browser_use_parallel():
from browser_use.browser.browser import BrowserConfig
from playwright.async_api import async_playwright
from browser_use.browser.browser import Browser
from src.agent.custom_agent import CustomAgent
from src.agent.custom_prompts import CustomSystemPrompt, CustomAgentMessagePrompt
from src.browser.custom_browser import CustomBrowser
from src.browser.custom_context import BrowserContextConfig
from src.controller.custom_controller import CustomController
window_w, window_h = 1920, 1080
from browser_use.browser.browser import Browser, BrowserConfig
from browser_use.browser.context import (
BrowserContextConfig,
BrowserContextWindowSize,
)
from browser_use.agent.service import Agent
from src.browser.custom_browser import CustomBrowser
from src.browser.custom_context import CustomBrowserContextConfig
from src.controller.custom_controller import CustomController
from src.utils import llm_provider
# llm = utils.get_llm_model(
# provider="openai",
@@ -258,20 +194,13 @@ async def test_browser_use_parallel():
# api_key=os.getenv("OPENAI_API_KEY", ""),
# )
# llm = utils.get_llm_model(
# provider="azure_openai",
# model_name="gpt-4o",
# temperature=0.8,
# base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
# api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
# )
llm = utils.get_llm_model(
provider="gemini",
model_name="gemini-2.0-flash-exp",
temperature=1.0,
api_key=os.getenv("GOOGLE_API_KEY", "")
)
# llm = utils.get_llm_model(
# provider="google",
# model_name="gemini-2.0-flash",
# temperature=0.6,
# api_key=os.getenv("GOOGLE_API_KEY", "")
# )
# llm = utils.get_llm_model(
# provider="deepseek",
@@ -293,72 +222,119 @@ async def test_browser_use_parallel():
# provider="ollama", model_name="deepseek-r1:14b", temperature=0.5
# )
window_w, window_h = 1280, 1100
llm = llm_provider.get_llm_model(
provider="azure_openai",
model_name="gpt-4o",
temperature=0.5,
base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
)
mcp_server_config = {
"mcpServers": {
"markitdown": {
"command": "docker",
"args": [
"run",
"--rm",
"-i",
"markitdown-mcp:latest"
]
},
"desktop-commander": {
"command": "npx",
"args": [
"-y",
"@wonderwhy-er/desktop-commander"
]
},
# "filesystem": {
# "command": "npx",
# "args": [
# "-y",
# "@modelcontextprotocol/server-filesystem",
# "/Users/xxx/ai_workspace",
# ]
# },
}
}
controller = CustomController()
use_own_browser = True
await controller.setup_mcp_client(mcp_server_config)
use_own_browser = False
disable_security = True
use_vision = True # Set to False when using DeepSeek
max_actions_per_step = 1
playwright = None
max_actions_per_step = 10
browser = None
browser_context = None
browser = Browser(
config=BrowserConfig(
disable_security=True,
headless=False,
new_context_config=BrowserContextConfig(save_recording_path='./tmp/recordings'),
)
)
try:
extra_chromium_args = [f"--window-size={window_w},{window_h}"]
if use_own_browser:
chrome_path = os.getenv("CHROME_PATH", None)
if chrome_path == "":
chrome_path = None
chrome_user_data = os.getenv("CHROME_USER_DATA", None)
if chrome_user_data:
extra_chromium_args += [f"--user-data-dir={chrome_user_data}"]
else:
chrome_path = None
browser = CustomBrowser(
config=BrowserConfig(
headless=False,
disable_security=disable_security,
browser_binary_path=chrome_path,
extra_browser_args=extra_chromium_args,
)
)
browser_context = await browser.new_context(
config=CustomBrowserContextConfig(
trace_path="./tmp/traces",
save_recording_path="./tmp/record_videos",
save_downloads_path="./tmp/downloads",
browser_window_size=BrowserContextWindowSize(
width=window_w, height=window_h
),
force_new_context=True
)
)
agents = [
Agent(task=task, llm=llm, browser=browser)
Agent(task=task, llm=llm, browser=browser, controller=controller)
for task in [
'Search Google for weather in Tokyo',
'Check Reddit front page title',
'Find NASA image of the day',
'Check top story on CNN',
# 'Check Reddit front page title',
# 'Find NASA image of the day',
# 'Check top story on CNN',
# 'Search latest SpaceX launch date',
# 'Look up population of Paris',
# 'Find current time in Sydney',
# 'Check who won last Super Bowl',
'Find current time in Sydney',
'Check who won last Super Bowl',
# 'Search trending topics on Twitter',
]
]
history = await asyncio.gather(*[agent.run() for agent in agents])
pdb.set_trace()
print("Final Result:")
pprint(history.final_result(), indent=4)
print("\nErrors:")
pprint(history.errors(), indent=4)
# e.g. xPaths the model clicked on
print("\nModel Outputs:")
pprint(history.model_actions(), indent=4)
pdb.set_trace()
print("\nThoughts:")
pprint(history.model_thoughts(), indent=4)
# close browser
except Exception:
import traceback
traceback.print_exc()
finally:
# 显式关闭持久化上下文
if browser_context:
await browser_context.close()
# 关闭 Playwright 对象
if playwright:
await playwright.stop()
if browser:
await browser.close()
if __name__ == "__main__":
asyncio.run(test_browser_use_org())
# asyncio.run(test_browser_use_parallel())
# asyncio.run(test_browser_use_custom())
# asyncio.run(test_browser_use_agent())
asyncio.run(test_browser_use_parallel())

View File

@@ -45,33 +45,37 @@ async def test_controller_with_mcp():
from src.controller.custom_controller import CustomController
from browser_use.controller.registry.views import ActionModel
test_server_config = {
"playwright": {
"command": "npx",
"args": [
"@playwright/mcp@latest",
],
"transport": "stdio",
},
"filesystem": {
"command": "npx",
"args": [
"-y",
"@modelcontextprotocol/server-filesystem",
"/Users/xxx/ai_workspace",
]
},
"desktop-commander": {
"command": "npx",
"args": [
"-y",
"@wonderwhy-er/desktop-commander"
]
mcp_server_config = {
"mcpServers": {
"markitdown": {
"command": "docker",
"args": [
"run",
"--rm",
"-i",
"markitdown-mcp:latest"
]
},
"desktop-commander": {
"command": "npx",
"args": [
"-y",
"@wonderwhy-er/desktop-commander"
]
},
# "filesystem": {
# "command": "npx",
# "args": [
# "-y",
# "@modelcontextprotocol/server-filesystem",
# "/Users/xxx/ai_workspace",
# ]
# },
}
}
controller = CustomController()
await controller.setup_mcp_client(test_server_config)
await controller.setup_mcp_client(mcp_server_config)
action_name = "mcp.desktop-commander.execute_command"
action_info = controller.registry.registry.actions[action_name]
param_model = action_info.param_model
@@ -85,7 +89,8 @@ async def test_controller_with_mcp():
result = await controller.act(action_model)
result = result.extracted_content
print(result)
if result and "Command is still running. Use read_output to get more output." in result and "PID" in result.split("\n")[0]:
if result and "Command is still running. Use read_output to get more output." in result and "PID" in \
result.split("\n")[0]:
pid = int(result.split("\n")[0].split("PID")[-1].strip())
action_name = "mcp.desktop-commander.read_output"
action_info = controller.registry.registry.actions[action_name]

View File

@@ -144,10 +144,10 @@ def test_ibm_model():
if __name__ == "__main__":
# test_openai_model()
# test_google_model()
# test_azure_openai_model()
test_azure_openai_model()
# test_deepseek_model()
# test_ollama_model()
# test_deepseek_r1_model()
# test_deepseek_r1_ollama_model()
# test_mistral_model()
test_ibm_model()
# test_ibm_model()