This commit is contained in:
katiue
2025-01-07 21:44:26 +07:00
7 changed files with 106 additions and 30 deletions

View File

@@ -5,4 +5,56 @@ sdk: gradio
sdk_version: 5.9.1
python_version: 3.12
startup_duration_timeout: 2h
---
---
# Browser-Use WebUI
## Background
This project builds upon the foundation of the [browser-use](https://github.com/browser-use/browser-use), which is designed to make websites accessible for AI agents. We have enhanced the original capabilities by providing:
1. **A Brand New WebUI:** We offer a comprehensive web interface that supports a wide range of `browser-use` functionalities. This UI is designed to be user-friendly and enables easy interaction with the browser agent.
2. **Expanded LLM Support:** We've integrated support for various Large Language Models (LLMs), including: Gemini, OpenAI, Azure OpenAI, Anthropic, DeepSeek, Ollama etc. And we plan to add support for even more models in the future.
3. **Custom Browser Support:** You can use your own browser with our tool, eliminating the need to re-login to sites or deal with other authentication challenges. This feature also supports high-definition screen recording.
4. **Customized Agent:** We've implemented a custom agent that enhances `browser-use` with optimized prompts.
<video src="https://github.com/user-attachments/assets/58c0f59e-02b4-4413-aba8-6184616bf181" controls="controls" width="500" height="300" >Your browser does not support playing this video!</video>
**Changelog**
- [x] **2025/01/06:** Thanks to @richard-devbot, a New and Well-Designed WebUI is released. [Video tutorial demo](https://github.com/warmshao/browser-use-webui/issues/1#issuecomment-2573393113).
## Environment Installation
1. **Python Version:** Ensure you have Python 3.11 or higher installed.
2. **Install `browser-use`:**
```bash
pip install browser-use
```
3. **Install Playwright:**
```bash
playwright install
```
4. **Install Dependencies:**
```bash
pip install -r requirements.txt
```
5. **Configure Environment Variables:**
- Copy `.env.example` to `.env` and set your environment variables, including API keys for the LLM.
- **If using your own browser:**
- Set `CHROME_PATH` to the executable path of your browser (e.g., `C:\Program Files\Google\Chrome\Application\chrome.exe` on Windows).
- Set `CHROME_USER_DATA` to the user data directory of your browser (e.g., `C:\Users\<YourUsername>\AppData\Local\Google\Chrome\User Data`).
## Usage
1. **Run the WebUI:**
```bash
python webui.py --ip 127.0.0.1 --port 7788
```
2. **Access the WebUI:** Open your web browser and navigate to `http://127.0.0.1:7788`.
3. **Using Your Own Browser:**
- Close all Chrome windows
- Open the WebUI in a non-Chrome browser, such as Firefox or Edge. This is important because the persistent browser context will use the Chrome data when running the agent.
- Check the "Use Own Browser" option within the Browser Settings.

View File

@@ -3,4 +3,6 @@ langchain-google-genai
pyperclip
gradio
python-dotenv
argparse
argparse
langchain-ollama

View File

@@ -82,7 +82,7 @@ class CustomSystemPrompt(SystemPrompt):
- sometimes labels overlap, so use the context to verify the correct element
7. Form filling:
- If you fill a input field and your action sequence is interrupted, most often a list with suggestions poped up under the field and you need to first select the right element from the suggestion list.
- If you fill an input field and your action sequence is interrupted, most often a list with suggestions poped up under the field and you need to first select the right element from the suggestion list.
8. ACTION SEQUENCING:
- Actions are executed in the order they appear in the list

View File

@@ -11,6 +11,7 @@ import os
from langchain_openai import ChatOpenAI, AzureChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_ollama import ChatOllama
def get_llm_model(provider: str, **kwargs):
@@ -39,7 +40,7 @@ def get_llm_model(provider: str, **kwargs):
)
elif provider == 'openai':
if not kwargs.get("base_url", ""):
base_url = "https://api.openai.com/v1"
base_url = os.getenv("OPENAI_ENDPOINT", "https://api.openai.com/v1")
else:
base_url = kwargs.get("base_url")
@@ -66,7 +67,7 @@ def get_llm_model(provider: str, **kwargs):
api_key = kwargs.get("api_key")
return ChatOpenAI(
model=kwargs.get("model_name", 'gpt-4o'),
model=kwargs.get("model_name", 'deepseek-chat'),
temperature=kwargs.get("temperature", 0.0),
base_url=base_url,
api_key=api_key
@@ -81,6 +82,11 @@ def get_llm_model(provider: str, **kwargs):
temperature=kwargs.get("temperature", 0.0),
google_api_key=api_key,
)
elif provider == 'ollama':
return ChatOllama(
model=kwargs.get("model_name", 'qwen2.5:7b'),
temperature=kwargs.get("temperature", 0.0),
)
elif provider == "azure_openai":
if not kwargs.get("base_url", ""):
base_url = os.getenv("AZURE_OPENAI_ENDPOINT", "")

View File

@@ -105,9 +105,15 @@ async def test_browser_use_custom():
# api_key=os.getenv("GOOGLE_API_KEY", "")
# )
# llm = utils.get_llm_model(
# provider="deepseek",
# model_name="deepseek-chat",
# temperature=0.8
# )
llm = utils.get_llm_model(
provider="deepseek",
model_name="deepseek-chat",
provider="ollama",
model_name="qwen2.5:7b",
temperature=0.8
)

View File

@@ -106,7 +106,6 @@ def test_deepseek_model():
base_url=os.getenv("DEEPSEEK_ENDPOINT", ""),
api_key=os.getenv("DEEPSEEK_API_KEY", "")
)
pdb.set_trace()
message = HumanMessage(
content=[
{"type": "text", "text": "who are you?"}
@@ -116,8 +115,17 @@ def test_deepseek_model():
print(ai_msg.content)
def test_ollama_model():
from langchain_ollama import ChatOllama
llm = ChatOllama(model="qwen2.5:7b")
ai_msg = llm.invoke("Sing a ballad of LangChain.")
print(ai_msg.content)
if __name__ == '__main__':
# test_openai_model()
# test_gemini_model()
# test_azure_openai_model()
test_deepseek_model()
# test_deepseek_model()
test_ollama_model()

View File

@@ -46,10 +46,14 @@ async def run_browser_agent(
use_vision,
browser_context=None # Added optional argument
):
"""
Runs the browser agent based on user configurations.
"""
# Ensure the recording directory exists
os.makedirs(save_recording_path, exist_ok=True)
# Get the list of existing videos before the agent runs
existing_videos = set(glob.glob(os.path.join(save_recording_path, '*.[mM][pP]4')) +
glob.glob(os.path.join(save_recording_path, '*.[wW][eE][bB][mM]')))
# Run the agent
llm = utils.get_llm_model(
provider=llm_provider,
model_name=llm_model_name,
@@ -58,7 +62,7 @@ async def run_browser_agent(
api_key=llm_api_key
)
if agent_type == "org":
return await run_org_agent(
final_result, errors, model_actions, model_thoughts = await run_org_agent(
llm=llm,
headless=headless,
disable_security=disable_security,
@@ -71,7 +75,7 @@ async def run_browser_agent(
browser_context=browser_context # pass context
)
elif agent_type == "custom":
return await run_custom_agent(
final_result, errors, model_actions, model_thoughts = await run_custom_agent(
llm=llm,
use_own_browser=use_own_browser,
headless=headless,
@@ -88,6 +92,16 @@ async def run_browser_agent(
else:
raise ValueError(f"Invalid agent type: {agent_type}")
# Get the list of videos after the agent runs
new_videos = set(glob.glob(os.path.join(save_recording_path, '*.[mM][pP]4')) +
glob.glob(os.path.join(save_recording_path, '*.[wW][eE][bB][mM]')))
# Find the newly created video
latest_video = None
if new_videos - existing_videos:
latest_video = list(new_videos - existing_videos)[0] # Get the first new video
return final_result, errors, model_actions, model_thoughts, latest_video
async def run_org_agent(
llm,
@@ -420,22 +434,10 @@ def main():
run_button.click(
fn=run_with_stream,
inputs=[
agent_type,
llm_provider,
llm_model_name,
llm_temperature,
llm_base_url,
llm_api_key,
use_own_browser,
headless,
disable_security,
window_w,
window_h,
save_recording_path,
task,
add_infos,
max_steps,
use_vision
agent_type, llm_provider, llm_model_name, llm_temperature,
llm_base_url, llm_api_key, use_own_browser, headless,
disable_security, window_w, window_h, save_recording_path,
task, add_infos, max_steps, use_vision
],
outputs=[
browser_view,