Merge pull request #3 from warmshao/dev

add deepseek
This commit is contained in:
warmshao
2025-01-03 19:33:28 +08:00
committed by GitHub
6 changed files with 83 additions and 12 deletions

View File

@@ -6,7 +6,7 @@ This project builds upon the foundation of the [browser-use](https://github.com/
1. **A Brand New WebUI:** We offer a comprehensive web interface that supports a wide range of `browser-use` functionalities. This UI is designed to be user-friendly and enables easy interaction with the browser agent.
2. **Expanded LLM Support:** We've integrated support for various Large Language Models (LLMs), including: Gemini, OpenAI, Azure OpenAI, Anthropic etc. And we plan to add support for even more models in the future.
2. **Expanded LLM Support:** We've integrated support for various Large Language Models (LLMs), including: Gemini, OpenAI, Azure OpenAI, Anthropic, DeepSeek etc. And we plan to add support for even more models in the future.
3. **Custom Browser Support:** You can use your own browser with our tool, eliminating the need to re-login to sites or deal with other authentication challenges. This feature also supports high-definition screen recording.
@@ -43,5 +43,6 @@ This project builds upon the foundation of the [browser-use](https://github.com/
```
2. **Access the WebUI:** Open your web browser and navigate to `http://127.0.0.1:7788`.
3. **Using Your Own Browser:**
- Close all Chrome windows.
- Open the WebUI in a non-Chrome browser, such as Firefox or Edge. This is important because the persistent browser context will use the Chrome data when running the agent.
- Check the "Use Own Browser" option within the Browser Settings.

View File

@@ -151,6 +151,20 @@ class CustomAgent(Agent):
if completed_contents and 'None' not in completed_contents:
step_info.task_progress = completed_contents
@time_execution_async('--get_next_action')
async def get_next_action(self, input_messages: list[BaseMessage]) -> AgentOutput:
    """Query the LLM for the next action(s) to take.

    Args:
        input_messages: Conversation history / state messages sent to the model.

    Returns:
        AgentOutput: Parsed model response, with the action list truncated to
        ``self.max_actions_per_step``.

    Raises:
        json.JSONDecodeError: If the model reply is not valid JSON after
        stripping Markdown code fences.
    """
    # Use the async API: calling the synchronous `invoke` inside an
    # `async def` would block the event loop for the whole LLM round-trip.
    ret = await self.llm.ainvoke(input_messages)
    # Models frequently wrap JSON in ```json ... ``` fences; strip them
    # (and surrounding whitespace) before parsing.
    raw = ret.content.replace('```json', '').replace("```", "").strip()
    parsed_json = json.loads(raw)
    parsed: AgentOutput = self.AgentOutput(**parsed_json)
    # Cap the number of actions executed this step.
    parsed.action = parsed.action[: self.max_actions_per_step]
    self._log_response(parsed)
    self.n_steps += 1
    return parsed
@time_execution_async('--step')
async def step(self, step_info: Optional[CustomAgentStepInfo] = None) -> None:
"""Execute one step of the task"""

View File

@@ -48,6 +48,23 @@ def get_llm_model(provider: str, **kwargs):
else:
api_key = kwargs.get("api_key")
return ChatOpenAI(
model=kwargs.get("model_name", 'gpt-4o'),
temperature=kwargs.get("temperature", 0.0),
base_url=base_url,
api_key=api_key
)
elif provider == 'deepseek':
if not kwargs.get("base_url", ""):
base_url = os.getenv("DEEPSEEK_ENDPOINT", "")
else:
base_url = kwargs.get("base_url")
if not kwargs.get("api_key", ""):
api_key = os.getenv("DEEPSEEK_API_KEY", "")
else:
api_key = kwargs.get("api_key")
return ChatOpenAI(
model=kwargs.get("model_name", 'gpt-4o'),
temperature=kwargs.get("temperature", 0.0),

View File

@@ -98,16 +98,23 @@ async def test_browser_use_custom():
# api_key=os.getenv("AZURE_OPENAI_API_KEY", "")
# )
# llm = utils.get_llm_model(
# provider="gemini",
# model_name="gemini-2.0-flash-exp",
# temperature=1.0,
# api_key=os.getenv("GOOGLE_API_KEY", "")
# )
llm = utils.get_llm_model(
provider="gemini",
model_name="gemini-2.0-flash-exp",
temperature=1.0,
api_key=os.getenv("GOOGLE_API_KEY", "")
provider="deepseek",
model_name="deepseek-chat",
temperature=0.8
)
controller = CustomController()
use_own_browser = False
disable_security = True
use_vision = False
playwright = None
browser_context_ = None
try:
@@ -156,7 +163,8 @@ async def test_browser_use_custom():
llm=llm,
browser_context=browser_context,
controller=controller,
system_prompt_class=CustomSystemPrompt
system_prompt_class=CustomSystemPrompt,
use_vision=use_vision
)
history: AgentHistoryList = await agent.run(max_steps=10)

View File

@@ -95,7 +95,29 @@ def test_azure_openai_model():
print(ai_msg.content)
def test_deepseek_model():
    """Smoke-test the DeepSeek chat model through the OpenAI-compatible client.

    Requires the DEEPSEEK_ENDPOINT and DEEPSEEK_API_KEY environment
    variables to be set; prints the model's reply to stdout.
    """
    from langchain_core.messages import HumanMessage
    from src.utils import utils
    llm = utils.get_llm_model(
        provider="deepseek",
        model_name="deepseek-chat",
        temperature=0.8,
        base_url=os.getenv("DEEPSEEK_ENDPOINT", ""),
        api_key=os.getenv("DEEPSEEK_API_KEY", "")
    )
    # NOTE: removed a leftover `pdb.set_trace()` debugger breakpoint here —
    # it would hang any non-interactive run of this test.
    message = HumanMessage(
        content=[
            {"type": "text", "text": "who are you?"}
        ]
    )
    ai_msg = llm.invoke([message])
    print(ai_msg.content)
if __name__ == '__main__':
# test_openai_model()
test_gemini_model()
# test_gemini_model()
# test_azure_openai_model()
test_deepseek_model()

View File

@@ -52,7 +52,8 @@ async def run_browser_agent(
save_recording_path,
task,
add_infos,
max_steps
max_steps,
use_vision
):
"""
Runs the browser agent based on user configurations.
@@ -75,6 +76,7 @@ async def run_browser_agent(
save_recording_path=save_recording_path,
task=task,
max_steps=max_steps,
use_vision=use_vision
)
elif agent_type == "custom":
return await run_custom_agent(
@@ -88,6 +90,7 @@ async def run_browser_agent(
task=task,
add_infos=add_infos,
max_steps=max_steps,
use_vision=use_vision
)
else:
raise ValueError(f"Invalid agent type: {agent_type}")
@@ -101,7 +104,8 @@ async def run_org_agent(
window_h,
save_recording_path,
task,
max_steps
max_steps,
use_vision
):
browser = Browser(
config=BrowserConfig(
@@ -121,6 +125,7 @@ async def run_org_agent(
agent = Agent(
task=task,
llm=llm,
use_vision=use_vision,
browser_context=browser_context,
)
history = await agent.run(max_steps=max_steps)
@@ -143,7 +148,8 @@ async def run_custom_agent(
save_recording_path,
task,
add_infos,
max_steps
max_steps,
use_vision
):
controller = CustomController()
playwright = None
@@ -190,6 +196,7 @@ async def run_custom_agent(
agent = CustomAgent(
task=task,
add_infos=add_infos,
use_vision=use_vision,
llm=llm,
browser_context=browser_context,
controller=controller,
@@ -245,9 +252,10 @@ def main():
with gr.Row():
agent_type = gr.Radio(["org", "custom"], label="Agent Type", value="custom")
max_steps = gr.Number(label="max run steps", value=100)
use_vision = gr.Checkbox(label="use vision", value=True)
with gr.Row():
llm_provider = gr.Dropdown(
["anthropic", "openai", "gemini", "azure_openai"], label="LLM Provider", value="gemini"
["anthropic", "openai", "gemini", "azure_openai", "deepseek"], label="LLM Provider", value="gemini"
)
llm_model_name = gr.Textbox(label="LLM Model Name", value="gemini-2.0-flash-exp")
llm_temperature = gr.Number(label="LLM Temperature", value=1.0)
@@ -293,7 +301,8 @@ def main():
save_recording_path,
task,
add_infos,
max_steps
max_steps,
use_vision
],
outputs=[final_result_output, errors_output, model_actions_output, model_thoughts_output],
)