feat: update document toolkit with crawl4ai

This commit is contained in:
Yuhang Zhou
2026-02-04 22:12:11 +08:00
parent eb7cb00b74
commit 1b52388b53
4 changed files with 59 additions and 9 deletions

View File

@@ -30,7 +30,8 @@ import subprocess
import xmltodict import xmltodict
import nest_asyncio import nest_asyncio
import traceback import traceback
import html2text import asyncio
from crawl4ai import AsyncWebCrawler
nest_asyncio.apply() nest_asyncio.apply()
@@ -237,20 +238,32 @@ class DocumentProcessingToolkit(BaseToolkit):
return str(data["data"][0]["markdown"]) return str(data["data"][0]["markdown"])
else: else:
logger.warning("Firecrawl API key is not set. Use html2text to extract the content of the webpage.") logger.warning("Firecrawl API key is not set. Use crawl4ai to extract the content of the webpage.")
return self._extract_webpage_content_with_html2text(url) return self._extract_webpage_content_with_crawl4ai(url)
def _extract_webpage_content_with_html2text(self, url: str) -> str: def _extract_webpage_content_with_crawl4ai(self, url: str) -> str:
r"""Extract the content of a webpage using html2text.""" r"""Extract the content of a webpage using crawl4ai."""
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
try: try:
response = requests.get(url, headers={"User-Agent": user_agent}) # Use asyncio.run to execute the async function
response.raise_for_status() return asyncio.run(self._async_extract_webpage_content_with_crawl4ai(url))
return html2text.html2text(response.text)
except Exception as e: except Exception as e:
logger.error(f"Error while extracting the content of the webpage: {e}") logger.error(f"Error while extracting the content of the webpage: {e}")
return "Error while extracting the content of the webpage." return "Error while extracting the content of the webpage."
async def _async_extract_webpage_content_with_crawl4ai(self, url: str) -> str:
    r"""Crawl a webpage with crawl4ai and return its content as markdown.

    Args:
        url (str): Address of the webpage to crawl.

    Returns:
        str: The extracted markdown. When the crawl reports a failure or
            raises an exception, a short error message is returned
            instead; when the page yields no markdown, a "no content"
            message is returned. Exceptions are logged, not propagated.
    """
    try:
        async with AsyncWebCrawler(verbose=False) as crawler:
            result = await crawler.arun(url=url)

        # NOTE(review): crawl4ai results are expected to carry `success`
        # and `error_message` fields -- confirm against the pinned
        # version. `getattr` keeps the previous behavior if they are
        # missing.
        if getattr(result, "success", True) is False:
            reason = getattr(result, "error_message", None)
            logger.error(f"Error while extracting the content of the webpage with crawl4ai: {reason}")
            return "Error while extracting the content of the webpage."

        markdown = result.markdown
        if not markdown:
            logger.warning("No markdown content extracted from the webpage.")
            return "No content found on the webpage."

        # NOTE(review): depending on the crawl4ai release, `markdown` may
        # be a plain string or a result object exposing `raw_markdown`
        # -- confirm. Either way, hand the caller a real `str`, as the
        # signature promises.
        raw_markdown = getattr(markdown, "raw_markdown", None)
        if isinstance(raw_markdown, str) and raw_markdown:
            return raw_markdown
        return str(markdown)
    except Exception as e:
        logger.error(f"Error while extracting the content of the webpage with crawl4ai: {e}")
        return "Error while extracting the content of the webpage."
def _download_file(self, url: str): def _download_file(self, url: str):

View File

@@ -28,6 +28,7 @@ dependencies = [
"mcp-server-fetch==2025.1.17", "mcp-server-fetch==2025.1.17",
"xmltodict>=0.14.2", "xmltodict>=0.14.2",
"firecrawl>=2.5.3", "firecrawl>=2.5.3",
"crawl4ai>=0.3.0",
"mistralai>=1.7.0", "mistralai>=1.7.0",
"retry==0.9.2", "retry==0.9.2",
] ]

View File

@@ -5,4 +5,5 @@ mcp-simple-arxiv==0.2.2
mcp-server-fetch==2025.1.17 mcp-server-fetch==2025.1.17
xmltodict>=0.14.2 xmltodict>=0.14.2
firecrawl>=2.5.3 firecrawl>=2.5.3
crawl4ai>=0.3.0
retry==0.9.2 retry==0.9.2

35
uv.lock generated
View File

@@ -700,6 +700,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/b5/0e/d4b7d6a8df5074cf67bc14adead39955b0bf847c947ff6cad0bb527887f4/ddgs-9.10.0-py3-none-any.whl", hash = "sha256:81233d79309836eb03e7df2a0d2697adc83c47c342713132c0ba618f1f2c6eee", size = 40311 }, { url = "https://files.pythonhosted.org/packages/b5/0e/d4b7d6a8df5074cf67bc14adead39955b0bf847c947ff6cad0bb527887f4/ddgs-9.10.0-py3-none-any.whl", hash = "sha256:81233d79309836eb03e7df2a0d2697adc83c47c342713132c0ba618f1f2c6eee", size = 40311 },
] ]
[[package]]
name = "decorator"
version = "5.2.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/43/fa/6d96a0978d19e17b68d634497769987b16c8f4cd0a7a05048bec693caa6b/decorator-5.2.1.tar.gz", hash = "sha256:65f266143752f734b0a7cc83c46f4618af75b8c5911b00ccb61d0ac9b6da0360", size = 56711 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/4e/8c/f3147f5c4b73e7550fe5f9352eaa956ae838d5c51eb58e7a25b9f3e2643b/decorator-5.2.1-py3-none-any.whl", hash = "sha256:d316bb415a2d9e2d2b3abcc4084c6502fc09240e292cd76a76afc106a1c8e04a", size = 9190 },
]
[[package]] [[package]]
name = "dill" name = "dill"
version = "0.3.8" version = "0.3.8"
@@ -2087,24 +2096,28 @@ version = "0.0.1"
source = { editable = "." } source = { editable = "." }
dependencies = [ dependencies = [
{ name = "camel-ai", extra = ["owl"] }, { name = "camel-ai", extra = ["owl"] },
{ name = "crawl4ai" },
{ name = "docx2markdown" }, { name = "docx2markdown" },
{ name = "firecrawl" }, { name = "firecrawl" },
{ name = "gradio" }, { name = "gradio" },
{ name = "mcp-server-fetch" }, { name = "mcp-server-fetch" },
{ name = "mcp-simple-arxiv" }, { name = "mcp-simple-arxiv" },
{ name = "mistralai" }, { name = "mistralai" },
{ name = "retry" },
{ name = "xmltodict" }, { name = "xmltodict" },
] ]
[package.metadata] [package.metadata]
requires-dist = [ requires-dist = [
{ name = "camel-ai", extras = ["owl"], specifier = "==0.2.84" }, { name = "camel-ai", extras = ["owl"], specifier = "==0.2.84" },
{ name = "crawl4ai", specifier = ">=0.3.0" },
{ name = "docx2markdown", specifier = ">=0.1.1" }, { name = "docx2markdown", specifier = ">=0.1.1" },
{ name = "firecrawl", specifier = ">=2.5.3" }, { name = "firecrawl", specifier = ">=2.5.3" },
{ name = "gradio", specifier = ">=3.50.2" }, { name = "gradio", specifier = ">=3.50.2" },
{ name = "mcp-server-fetch", specifier = "==2025.1.17" }, { name = "mcp-server-fetch", specifier = "==2025.1.17" },
{ name = "mcp-simple-arxiv", specifier = "==0.2.2" }, { name = "mcp-simple-arxiv", specifier = "==0.2.2" },
{ name = "mistralai", specifier = ">=1.7.0" }, { name = "mistralai", specifier = ">=1.7.0" },
{ name = "retry", specifier = "==0.9.2" },
{ name = "xmltodict", specifier = ">=0.14.2" }, { name = "xmltodict", specifier = ">=0.14.2" },
] ]
@@ -2363,6 +2376,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/05/33/2d74d588408caedd065c2497bdb5ef83ce6082db01289a1e1147f6639802/psutil-5.9.8-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:d16bbddf0693323b8c6123dd804100241da461e41d6e332fb0ba6058f630f8c8", size = 249898 }, { url = "https://files.pythonhosted.org/packages/05/33/2d74d588408caedd065c2497bdb5ef83ce6082db01289a1e1147f6639802/psutil-5.9.8-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:d16bbddf0693323b8c6123dd804100241da461e41d6e332fb0ba6058f630f8c8", size = 249898 },
] ]
[[package]]
name = "py"
version = "1.11.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/98/ff/fec109ceb715d2a6b4c4a85a61af3b40c723a961e8828319fbcb15b868dc/py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719", size = 207796 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/f6/f0/10642828a8dfb741e5f3fbaac830550a518a775c7fff6f04a007259b0548/py-1.11.0-py2.py3-none-any.whl", hash = "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378", size = 98708 },
]
[[package]] [[package]]
name = "pyarrow" name = "pyarrow"
version = "20.0.0" version = "20.0.0"
@@ -3058,6 +3080,19 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/3f/51/d4db610ef29373b879047326cbf6fa98b6c1969d6f6dc423279de2b1be2c/requests_toolbelt-1.0.0-py2.py3-none-any.whl", hash = "sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06", size = 54481 }, { url = "https://files.pythonhosted.org/packages/3f/51/d4db610ef29373b879047326cbf6fa98b6c1969d6f6dc423279de2b1be2c/requests_toolbelt-1.0.0-py2.py3-none-any.whl", hash = "sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06", size = 54481 },
] ]
[[package]]
name = "retry"
version = "0.9.2"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "decorator" },
{ name = "py" },
]
sdist = { url = "https://files.pythonhosted.org/packages/9d/72/75d0b85443fbc8d9f38d08d2b1b67cc184ce35280e4a3813cda2f445f3a4/retry-0.9.2.tar.gz", hash = "sha256:f8bfa8b99b69c4506d6f5bd3b0aabf77f98cdb17f3c9fc3f5ca820033336fba4", size = 6448 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/4b/0d/53aea75710af4528a25ed6837d71d117602b01946b307a3912cb3cfcbcba/retry-0.9.2-py2.py3-none-any.whl", hash = "sha256:ccddf89761fa2c726ab29391837d4327f819ea14d244c232a1d24c67a2f98606", size = 7986 },
]
[[package]] [[package]]
name = "rfc3339-validator" name = "rfc3339-validator"
version = "0.1.4" version = "0.1.4"