feat: update document toolkit with crawl4ai

This commit is contained in:
Yuhang Zhou
2026-02-04 22:12:11 +08:00
parent eb7cb00b74
commit 1b52388b53
4 changed files with 59 additions and 9 deletions

View File

@@ -30,7 +30,8 @@ import subprocess
import xmltodict
import nest_asyncio
import traceback
import html2text
import asyncio
from crawl4ai import AsyncWebCrawler
nest_asyncio.apply()
@@ -237,21 +238,33 @@ class DocumentProcessingToolkit(BaseToolkit):
return str(data["data"][0]["markdown"])
else:
logger.warning("Firecrawl API key is not set. Use html2text to extract the content of the webpage.")
return self._extract_webpage_content_with_html2text(url)
logger.warning("Firecrawl API key is not set. Use crawl4ai to extract the content of the webpage.")
return self._extract_webpage_content_with_crawl4ai(url)
def _extract_webpage_content_with_html2text(self, url: str) -> str:
r"""Extract the content of a webpage using html2text."""
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
def _extract_webpage_content_with_crawl4ai(self, url: str) -> str:
r"""Extract the content of a webpage using crawl4ai."""
# Synchronous entry point: runs the async crawl4ai helper to completion and
# returns whatever string that helper produces for `url`.
try:
# NOTE(review): the next three lines (requests + html2text, reading a
# `user_agent` that is not defined in this method) look like the removed
# side of this diff hunk, i.e. the old html2text implementation, rather
# than live code -- confirm against the actual file.
response = requests.get(url, headers={"User-Agent": user_agent})
response.raise_for_status()
return html2text.html2text(response.text)
# Use asyncio.run to execute the async function
return asyncio.run(self._async_extract_webpage_content_with_crawl4ai(url))
except Exception as e:
# Any failure is logged and reported to the caller as a fixed message
# string instead of being raised.
logger.error(f"Error while extracting the content of the webpage: {e}")
return "Error while extracting the content of the webpage."
async def _async_extract_webpage_content_with_crawl4ai(self, url: str) -> str:
    r"""Crawl a webpage with crawl4ai and return its content as markdown.

    Args:
        url (str): The URL of the webpage to crawl.

    Returns:
        str: The markdown extracted from the page. If the crawl reports a
            failure or an exception is raised, the fixed message
            "Error while extracting the content of the webpage." is
            returned; if the crawl succeeds but produces no markdown,
            "No content found on the webpage." is returned. Exceptions are
            logged and never propagated to the caller.
    """
    try:
        async with AsyncWebCrawler(verbose=False) as crawler:
            result = await crawler.arun(url=url)

            # A crawl can fail without raising: the failure is reported on
            # the result object. Check it explicitly so the real reason is
            # logged instead of being mistaken for an empty page.
            # `getattr` keeps this tolerant of result objects that lack
            # these attributes.
            if not getattr(result, "success", True):
                reason = getattr(result, "error_message", None) or "unknown error"
                logger.error(f"crawl4ai failed to crawl the webpage {url}: {reason}")
                return "Error while extracting the content of the webpage."

            if result.markdown:
                return result.markdown

            logger.warning("No markdown content extracted from the webpage.")
            return "No content found on the webpage."
    except Exception as e:
        logger.error(f"Error while extracting the content of the webpage with crawl4ai: {e}")
        return "Error while extracting the content of the webpage."
def _download_file(self, url: str):
r"""Download a file from a URL and save it to the cache directory."""

View File

@@ -28,6 +28,7 @@ dependencies = [
"mcp-server-fetch==2025.1.17",
"xmltodict>=0.14.2",
"firecrawl>=2.5.3",
"crawl4ai>=0.3.0",
"mistralai>=1.7.0",
"retry==0.9.2",
]

View File

@@ -5,4 +5,5 @@ mcp-simple-arxiv==0.2.2
mcp-server-fetch==2025.1.17
xmltodict>=0.14.2
firecrawl>=2.5.3
crawl4ai>=0.3.0
retry==0.9.2

35
uv.lock generated
View File

@@ -700,6 +700,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/b5/0e/d4b7d6a8df5074cf67bc14adead39955b0bf847c947ff6cad0bb527887f4/ddgs-9.10.0-py3-none-any.whl", hash = "sha256:81233d79309836eb03e7df2a0d2697adc83c47c342713132c0ba618f1f2c6eee", size = 40311 },
]
[[package]]
name = "decorator"
version = "5.2.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/43/fa/6d96a0978d19e17b68d634497769987b16c8f4cd0a7a05048bec693caa6b/decorator-5.2.1.tar.gz", hash = "sha256:65f266143752f734b0a7cc83c46f4618af75b8c5911b00ccb61d0ac9b6da0360", size = 56711 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/4e/8c/f3147f5c4b73e7550fe5f9352eaa956ae838d5c51eb58e7a25b9f3e2643b/decorator-5.2.1-py3-none-any.whl", hash = "sha256:d316bb415a2d9e2d2b3abcc4084c6502fc09240e292cd76a76afc106a1c8e04a", size = 9190 },
]
[[package]]
name = "dill"
version = "0.3.8"
@@ -2087,24 +2096,28 @@ version = "0.0.1"
source = { editable = "." }
dependencies = [
{ name = "camel-ai", extra = ["owl"] },
{ name = "crawl4ai" },
{ name = "docx2markdown" },
{ name = "firecrawl" },
{ name = "gradio" },
{ name = "mcp-server-fetch" },
{ name = "mcp-simple-arxiv" },
{ name = "mistralai" },
{ name = "retry" },
{ name = "xmltodict" },
]
[package.metadata]
requires-dist = [
{ name = "camel-ai", extras = ["owl"], specifier = "==0.2.84" },
{ name = "crawl4ai", specifier = ">=0.3.0" },
{ name = "docx2markdown", specifier = ">=0.1.1" },
{ name = "firecrawl", specifier = ">=2.5.3" },
{ name = "gradio", specifier = ">=3.50.2" },
{ name = "mcp-server-fetch", specifier = "==2025.1.17" },
{ name = "mcp-simple-arxiv", specifier = "==0.2.2" },
{ name = "mistralai", specifier = ">=1.7.0" },
{ name = "retry", specifier = "==0.9.2" },
{ name = "xmltodict", specifier = ">=0.14.2" },
]
@@ -2363,6 +2376,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/05/33/2d74d588408caedd065c2497bdb5ef83ce6082db01289a1e1147f6639802/psutil-5.9.8-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:d16bbddf0693323b8c6123dd804100241da461e41d6e332fb0ba6058f630f8c8", size = 249898 },
]
[[package]]
name = "py"
version = "1.11.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/98/ff/fec109ceb715d2a6b4c4a85a61af3b40c723a961e8828319fbcb15b868dc/py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719", size = 207796 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/f6/f0/10642828a8dfb741e5f3fbaac830550a518a775c7fff6f04a007259b0548/py-1.11.0-py2.py3-none-any.whl", hash = "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378", size = 98708 },
]
[[package]]
name = "pyarrow"
version = "20.0.0"
@@ -3058,6 +3080,19 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/3f/51/d4db610ef29373b879047326cbf6fa98b6c1969d6f6dc423279de2b1be2c/requests_toolbelt-1.0.0-py2.py3-none-any.whl", hash = "sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06", size = 54481 },
]
[[package]]
name = "retry"
version = "0.9.2"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "decorator" },
{ name = "py" },
]
sdist = { url = "https://files.pythonhosted.org/packages/9d/72/75d0b85443fbc8d9f38d08d2b1b67cc184ce35280e4a3813cda2f445f3a4/retry-0.9.2.tar.gz", hash = "sha256:f8bfa8b99b69c4506d6f5bd3b0aabf77f98cdb17f3c9fc3f5ca820033336fba4", size = 6448 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/4b/0d/53aea75710af4528a25ed6837d71d117602b01946b307a3912cb3cfcbcba/retry-0.9.2-py2.py3-none-any.whl", hash = "sha256:ccddf89761fa2c726ab29391837d4327f819ea14d244c232a1d24c67a2f98606", size = 7986 },
]
[[package]]
name = "rfc3339-validator"
version = "0.1.4"