diff --git a/README.md b/README.md index 59aeebe..ecb9bfe 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ This project builds upon the foundation of the [browser-use](https://github.com/ 1. **A Brand New WebUI:** We offer a comprehensive web interface that supports a wide range of `browser-use` functionalities. This UI is designed to be user-friendly and enables easy interaction with the browser agent. -2. **Expanded LLM Support:** We've integrated support for various Large Language Models (LLMs), including: Gemini, OpenAI, Azure OpenAI, Anthropic etc. And we plan to add support for even more models in the future. +2. **Expanded LLM Support:** We've integrated support for various Large Language Models (LLMs), including: Gemini, OpenAI, Azure OpenAI, Anthropic, DeepSeek, etc. And we plan to add support for even more models in the future. 3. **Custom Browser Support:** You can use your own browser with our tool, eliminating the need to re-login to sites or deal with other authentication challenges. This feature also supports high-definition screen recording. @@ -43,5 +43,6 @@ This project builds upon the foundation of the [browser-use](https://github.com/ ``` 2. **Access the WebUI:** Open your web browser and navigate to `http://127.0.0.1:7788`. 3. **Using Your Own Browser:** + - Close all Chrome windows - Open the WebUI in a non-Chrome browser, such as Firefox or Edge. This is important because the persistent browser context will use the Chrome data when running the agent. - Check the "Use Own Browser" option within the Browser Settings. 
diff --git a/src/agent/custom_agent.py b/src/agent/custom_agent.py index 5b075dd..027a450 100644 --- a/src/agent/custom_agent.py +++ b/src/agent/custom_agent.py @@ -151,6 +151,20 @@ class CustomAgent(Agent): if completed_contents and 'None' not in completed_contents: step_info.task_progress = completed_contents + @time_execution_async('--get_next_action') + async def get_next_action(self, input_messages: list[BaseMessage]) -> AgentOutput: + """Get next action from LLM based on current state""" + + ret = await self.llm.ainvoke(input_messages) + parsed_json = json.loads(ret.content.replace('```json', '').replace("```", "")) + parsed: AgentOutput = self.AgentOutput(**parsed_json) + # cut the number of actions to max_actions_per_step + parsed.action = parsed.action[: self.max_actions_per_step] + self._log_response(parsed) + self.n_steps += 1 + + return parsed + @time_execution_async('--step') async def step(self, step_info: Optional[CustomAgentStepInfo] = None) -> None: """Execute one step of the task""" diff --git a/src/utils/utils.py b/src/utils/utils.py index 1b8d7af..cc3b9e4 100644 --- a/src/utils/utils.py +++ b/src/utils/utils.py @@ -48,6 +48,23 @@ def get_llm_model(provider: str, **kwargs): else: api_key = kwargs.get("api_key") + return ChatOpenAI( + model=kwargs.get("model_name", 'gpt-4o'), + temperature=kwargs.get("temperature", 0.0), + base_url=base_url, + api_key=api_key + ) + elif provider == 'deepseek': + if not kwargs.get("base_url", ""): + base_url = os.getenv("DEEPSEEK_ENDPOINT", "") + else: + base_url = kwargs.get("base_url") + + if not kwargs.get("api_key", ""): + api_key = os.getenv("DEEPSEEK_API_KEY", "") + else: + api_key = kwargs.get("api_key") + return ChatOpenAI( model=kwargs.get("model_name", 'gpt-4o'), temperature=kwargs.get("temperature", 0.0), diff --git a/tests/test_browser_use.py b/tests/test_browser_use.py index 42ed270..cc6c11e 100644 --- a/tests/test_browser_use.py +++ b/tests/test_browser_use.py @@ -98,16 +98,23 @@ async def 
test_browser_use_custom(): # api_key=os.getenv("AZURE_OPENAI_API_KEY", "") # ) + # llm = utils.get_llm_model( + #     provider="gemini", + #     model_name="gemini-2.0-flash-exp", + #     temperature=1.0, + #     api_key=os.getenv("GOOGLE_API_KEY", "") + # ) + llm = utils.get_llm_model( - provider="gemini", - model_name="gemini-2.0-flash-exp", - temperature=1.0, - api_key=os.getenv("GOOGLE_API_KEY", "") + provider="deepseek", + model_name="deepseek-chat", + temperature=0.8 ) controller = CustomController() use_own_browser = False disable_security = True + use_vision = False playwright = None browser_context_ = None try: @@ -156,7 +163,8 @@ async def test_browser_use_custom(): llm=llm, browser_context=browser_context, controller=controller, - system_prompt_class=CustomSystemPrompt + system_prompt_class=CustomSystemPrompt, + use_vision=use_vision ) history: AgentHistoryList = await agent.run(max_steps=10) diff --git a/tests/test_llm_api.py b/tests/test_llm_api.py index f6c024d..03d5753 100644 --- a/tests/test_llm_api.py +++ b/tests/test_llm_api.py @@ -95,7 +95,28 @@ def test_azure_openai_model(): print(ai_msg.content) +def test_deepseek_model(): + from langchain_core.messages import HumanMessage + from src.utils import utils + + llm = utils.get_llm_model( + provider="deepseek", + model_name="deepseek-chat", + temperature=0.8, + base_url=os.getenv("DEEPSEEK_ENDPOINT", ""), + api_key=os.getenv("DEEPSEEK_API_KEY", "") + ) + message = HumanMessage( + content=[ + {"type": "text", "text": "who are you?"} + ] + ) + ai_msg = llm.invoke([message]) + print(ai_msg.content) + + if __name__ == '__main__': # test_openai_model() - test_gemini_model() + # test_gemini_model() # test_azure_openai_model() + test_deepseek_model() diff --git a/webui.py b/webui.py index a991c3e..f44bc14 100644 --- a/webui.py +++ b/webui.py @@ -52,7 +52,8 @@ async def run_browser_agent( save_recording_path, task, add_infos, - max_steps + max_steps, + use_vision ): """ Runs the browser agent based on user 
configurations. @@ -75,6 +76,7 @@ async def run_browser_agent( save_recording_path=save_recording_path, task=task, max_steps=max_steps, + use_vision=use_vision ) elif agent_type == "custom": return await run_custom_agent( @@ -88,6 +90,7 @@ async def run_browser_agent( task=task, add_infos=add_infos, max_steps=max_steps, + use_vision=use_vision ) else: raise ValueError(f"Invalid agent type: {agent_type}") @@ -101,7 +104,8 @@ async def run_org_agent( window_h, save_recording_path, task, - max_steps + max_steps, + use_vision ): browser = Browser( config=BrowserConfig( @@ -121,6 +125,7 @@ async def run_org_agent( agent = Agent( task=task, llm=llm, + use_vision=use_vision, browser_context=browser_context, ) history = await agent.run(max_steps=max_steps) @@ -143,7 +148,8 @@ async def run_custom_agent( save_recording_path, task, add_infos, - max_steps + max_steps, + use_vision ): controller = CustomController() playwright = None @@ -190,6 +196,7 @@ async def run_custom_agent( agent = CustomAgent( task=task, add_infos=add_infos, + use_vision=use_vision, llm=llm, browser_context=browser_context, controller=controller, @@ -245,9 +252,10 @@ def main(): with gr.Row(): agent_type = gr.Radio(["org", "custom"], label="Agent Type", value="custom") max_steps = gr.Number(label="max run steps", value=100) + use_vision = gr.Checkbox(label="use vision", value=True) with gr.Row(): llm_provider = gr.Dropdown( - ["anthropic", "openai", "gemini", "azure_openai"], label="LLM Provider", value="gemini" + ["anthropic", "openai", "gemini", "azure_openai", "deepseek"], label="LLM Provider", value="gemini" ) llm_model_name = gr.Textbox(label="LLM Model Name", value="gemini-2.0-flash-exp") llm_temperature = gr.Number(label="LLM Temperature", value=1.0) @@ -293,7 +301,8 @@ def main(): save_recording_path, task, add_infos, - max_steps + max_steps, + use_vision ], outputs=[final_result_output, errors_output, model_actions_output, model_thoughts_output], )