From 1b52388b535b0e0c3252b006bdfcdd7e95d5e777 Mon Sep 17 00:00:00 2001 From: Yuhang Zhou <1677382760@qq.com> Date: Wed, 4 Feb 2026 22:12:11 +0800 Subject: [PATCH] feat: update document toolkit with crawl4ai --- owl/utils/document_toolkit.py | 31 ++++++++++++++++++++++--------- pyproject.toml | 1 + requirements.txt | 1 + uv.lock | 35 +++++++++++++++++++++++++++++++++++ 4 files changed, 59 insertions(+), 9 deletions(-) diff --git a/owl/utils/document_toolkit.py b/owl/utils/document_toolkit.py index 44480c1..5b4d1cf 100644 --- a/owl/utils/document_toolkit.py +++ b/owl/utils/document_toolkit.py @@ -30,7 +30,8 @@ import subprocess import xmltodict import nest_asyncio import traceback -import html2text +import asyncio +from crawl4ai import AsyncWebCrawler nest_asyncio.apply() @@ -237,20 +238,32 @@ class DocumentProcessingToolkit(BaseToolkit): return str(data["data"][0]["markdown"]) else: - logger.warning("Firecrawl API key is not set. Use html2text to extract the content of the webpage.") - return self._extract_webpage_content_with_html2text(url) + logger.warning("Firecrawl API key is not set. Use crawl4ai to extract the content of the webpage.") + return self._extract_webpage_content_with_crawl4ai(url) - def _extract_webpage_content_with_html2text(self, url: str) -> str: - r"""Extract the content of a webpage using html2text.""" - user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" + def _extract_webpage_content_with_crawl4ai(self, url: str) -> str: + r"""Extract the content of a webpage using crawl4ai.""" try: - response = requests.get(url, headers={"User-Agent": user_agent}) - response.raise_for_status() - return html2text.html2text(response.text) + # Use asyncio.run to execute the async function + return asyncio.run(self._async_extract_webpage_content_with_crawl4ai(url)) except Exception as e: logger.error(f"Error while extracting the content of the webpage: {e}") return "Error while extracting the content of the webpage." + + async def _async_extract_webpage_content_with_crawl4ai(self, url: str) -> str: + r"""Async helper method to extract webpage content using crawl4ai.""" + try: + async with AsyncWebCrawler(verbose=False) as crawler: + result = await crawler.arun(url=url) + if result.markdown: + return result.markdown + else: + logger.warning("No markdown content extracted from the webpage.") + return "No content found on the webpage." + except Exception as e: + logger.error(f"Error while extracting the content of the webpage with crawl4ai: {e}") + return "Error while extracting the content of the webpage." def _download_file(self, url: str): diff --git a/pyproject.toml b/pyproject.toml index f44aad7..84e72fa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,6 +28,7 @@ dependencies = [ "mcp-server-fetch==2025.1.17", "xmltodict>=0.14.2", "firecrawl>=2.5.3", + "crawl4ai>=0.3.0", "mistralai>=1.7.0", "retry==0.9.2", ] diff --git a/requirements.txt b/requirements.txt index f8bcf75..c50915f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,4 +5,5 @@ mcp-simple-arxiv==0.2.2 mcp-server-fetch==2025.1.17 xmltodict>=0.14.2 firecrawl>=2.5.3 +crawl4ai>=0.3.0 retry==0.9.2 \ No newline at end of file diff --git a/uv.lock b/uv.lock index 495560f..28fb9ec 100644 --- a/uv.lock +++ b/uv.lock @@ -700,6 +700,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b5/0e/d4b7d6a8df5074cf67bc14adead39955b0bf847c947ff6cad0bb527887f4/ddgs-9.10.0-py3-none-any.whl", hash = "sha256:81233d79309836eb03e7df2a0d2697adc83c47c342713132c0ba618f1f2c6eee", size = 40311 }, ] +[[package]] +name = "decorator" +version = "5.2.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/43/fa/6d96a0978d19e17b68d634497769987b16c8f4cd0a7a05048bec693caa6b/decorator-5.2.1.tar.gz", hash = "sha256:65f266143752f734b0a7cc83c46f4618af75b8c5911b00ccb61d0ac9b6da0360", size = 56711 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4e/8c/f3147f5c4b73e7550fe5f9352eaa956ae838d5c51eb58e7a25b9f3e2643b/decorator-5.2.1-py3-none-any.whl", hash = "sha256:d316bb415a2d9e2d2b3abcc4084c6502fc09240e292cd76a76afc106a1c8e04a", size = 9190 }, +] + [[package]] name = "dill" version = "0.3.8" @@ -2087,24 +2096,28 @@ version = "0.0.1" source = { editable = "." } dependencies = [ { name = "camel-ai", extra = ["owl"] }, + { name = "crawl4ai" }, { name = "docx2markdown" }, { name = "firecrawl" }, { name = "gradio" }, { name = "mcp-server-fetch" }, { name = "mcp-simple-arxiv" }, { name = "mistralai" }, + { name = "retry" }, { name = "xmltodict" }, ] [package.metadata] requires-dist = [ { name = "camel-ai", extras = ["owl"], specifier = "==0.2.84" }, + { name = "crawl4ai", specifier = ">=0.3.0" }, { name = "docx2markdown", specifier = ">=0.1.1" }, { name = "firecrawl", specifier = ">=2.5.3" }, { name = "gradio", specifier = ">=3.50.2" }, { name = "mcp-server-fetch", specifier = "==2025.1.17" }, { name = "mcp-simple-arxiv", specifier = "==0.2.2" }, { name = "mistralai", specifier = ">=1.7.0" }, + { name = "retry", specifier = "==0.9.2" }, { name = "xmltodict", specifier = ">=0.14.2" }, ] @@ -2363,6 +2376,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/05/33/2d74d588408caedd065c2497bdb5ef83ce6082db01289a1e1147f6639802/psutil-5.9.8-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:d16bbddf0693323b8c6123dd804100241da461e41d6e332fb0ba6058f630f8c8", size = 249898 }, ] +[[package]] +name = "py" +version = "1.11.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/98/ff/fec109ceb715d2a6b4c4a85a61af3b40c723a961e8828319fbcb15b868dc/py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719", size = 207796 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f6/f0/10642828a8dfb741e5f3fbaac830550a518a775c7fff6f04a007259b0548/py-1.11.0-py2.py3-none-any.whl", hash = "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378", size = 98708 }, +] + [[package]] name = "pyarrow" version = "20.0.0" @@ -3058,6 +3080,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/3f/51/d4db610ef29373b879047326cbf6fa98b6c1969d6f6dc423279de2b1be2c/requests_toolbelt-1.0.0-py2.py3-none-any.whl", hash = "sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06", size = 54481 }, ] +[[package]] +name = "retry" +version = "0.9.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "decorator" }, + { name = "py" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9d/72/75d0b85443fbc8d9f38d08d2b1b67cc184ce35280e4a3813cda2f445f3a4/retry-0.9.2.tar.gz", hash = "sha256:f8bfa8b99b69c4506d6f5bd3b0aabf77f98cdb17f3c9fc3f5ca820033336fba4", size = 6448 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4b/0d/53aea75710af4528a25ed6837d71d117602b01946b307a3912cb3cfcbcba/retry-0.9.2-py2.py3-none-any.whl", hash = "sha256:ccddf89761fa2c726ab29391837d4327f819ea14d244c232a1d24c67a2f98606", size = 7986 }, +] + [[package]] name = "rfc3339-validator" version = "0.1.4"