mirror of
https://github.com/camel-ai/owl.git
synced 2026-03-22 05:57:17 +08:00
feat: update document toolkit with crawl4ai
This commit is contained in:
@@ -30,7 +30,8 @@ import subprocess
|
|||||||
import xmltodict
|
import xmltodict
|
||||||
import nest_asyncio
|
import nest_asyncio
|
||||||
import traceback
|
import traceback
|
||||||
import html2text
|
import asyncio
|
||||||
|
from crawl4ai import AsyncWebCrawler
|
||||||
|
|
||||||
nest_asyncio.apply()
|
nest_asyncio.apply()
|
||||||
|
|
||||||
@@ -237,20 +238,32 @@ class DocumentProcessingToolkit(BaseToolkit):
|
|||||||
|
|
||||||
return str(data["data"][0]["markdown"])
|
return str(data["data"][0]["markdown"])
|
||||||
else:
|
else:
|
||||||
logger.warning("Firecrawl API key is not set. Use html2text to extract the content of the webpage.")
|
logger.warning("Firecrawl API key is not set. Use crawl4ai to extract the content of the webpage.")
|
||||||
return self._extract_webpage_content_with_html2text(url)
|
return self._extract_webpage_content_with_crawl4ai(url)
|
||||||
|
|
||||||
|
|
||||||
def _extract_webpage_content_with_html2text(self, url: str) -> str:
|
def _extract_webpage_content_with_crawl4ai(self, url: str) -> str:
|
||||||
r"""Extract the content of a webpage using html2text."""
|
r"""Extract the content of a webpage using crawl4ai."""
|
||||||
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
|
||||||
try:
|
try:
|
||||||
response = requests.get(url, headers={"User-Agent": user_agent})
|
# Use asyncio.run to execute the async function
|
||||||
response.raise_for_status()
|
return asyncio.run(self._async_extract_webpage_content_with_crawl4ai(url))
|
||||||
return html2text.html2text(response.text)
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error while extracting the content of the webpage: {e}")
|
logger.error(f"Error while extracting the content of the webpage: {e}")
|
||||||
return "Error while extracting the content of the webpage."
|
return "Error while extracting the content of the webpage."
|
||||||
|
|
||||||
|
async def _async_extract_webpage_content_with_crawl4ai(self, url: str) -> str:
    r"""Async helper method to extract webpage content using crawl4ai.

    Opens an ``AsyncWebCrawler`` as an async context manager, crawls
    ``url`` and returns the markdown of the crawl result.

    Args:
        url (str): Address of the webpage to crawl.

    Returns:
        str: The extracted markdown as a plain string; a fixed
            "no content" message when the result carries no markdown;
            or a fixed error message when the crawl raises.
    """
    try:
        async with AsyncWebCrawler(verbose=False) as crawler:
            result = await crawler.arun(url=url)

        markdown = result.markdown
        if not markdown:
            logger.warning("No markdown content extracted from the webpage.")
            return "No content found on the webpage."

        # Coerce to ``str`` to honour the declared return type, as the
        # Firecrawl branch of the caller does with its own result.
        # NOTE(review): with the loose ``crawl4ai>=0.3.0`` pin the type of
        # ``result.markdown`` may differ between releases; confirm against
        # the installed version.
        return str(markdown)
    except Exception as e:
        logger.error(f"Error while extracting the content of the webpage with crawl4ai: {e}")
        return "Error while extracting the content of the webpage."
|
||||||
|
|
||||||
|
|
||||||
def _download_file(self, url: str):
|
def _download_file(self, url: str):
|
||||||
|
|||||||
@@ -28,6 +28,7 @@ dependencies = [
|
|||||||
"mcp-server-fetch==2025.1.17",
|
"mcp-server-fetch==2025.1.17",
|
||||||
"xmltodict>=0.14.2",
|
"xmltodict>=0.14.2",
|
||||||
"firecrawl>=2.5.3",
|
"firecrawl>=2.5.3",
|
||||||
|
"crawl4ai>=0.3.0",
|
||||||
"mistralai>=1.7.0",
|
"mistralai>=1.7.0",
|
||||||
"retry==0.9.2",
|
"retry==0.9.2",
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -5,4 +5,5 @@ mcp-simple-arxiv==0.2.2
|
|||||||
mcp-server-fetch==2025.1.17
|
mcp-server-fetch==2025.1.17
|
||||||
xmltodict>=0.14.2
|
xmltodict>=0.14.2
|
||||||
firecrawl>=2.5.3
|
firecrawl>=2.5.3
|
||||||
|
crawl4ai>=0.3.0
|
||||||
retry==0.9.2
|
retry==0.9.2
|
||||||
35
uv.lock
generated
35
uv.lock
generated
@@ -700,6 +700,15 @@ wheels = [
|
|||||||
{ url = "https://files.pythonhosted.org/packages/b5/0e/d4b7d6a8df5074cf67bc14adead39955b0bf847c947ff6cad0bb527887f4/ddgs-9.10.0-py3-none-any.whl", hash = "sha256:81233d79309836eb03e7df2a0d2697adc83c47c342713132c0ba618f1f2c6eee", size = 40311 },
|
{ url = "https://files.pythonhosted.org/packages/b5/0e/d4b7d6a8df5074cf67bc14adead39955b0bf847c947ff6cad0bb527887f4/ddgs-9.10.0-py3-none-any.whl", hash = "sha256:81233d79309836eb03e7df2a0d2697adc83c47c342713132c0ba618f1f2c6eee", size = 40311 },
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "decorator"
|
||||||
|
version = "5.2.1"
|
||||||
|
source = { registry = "https://pypi.org/simple" }
|
||||||
|
sdist = { url = "https://files.pythonhosted.org/packages/43/fa/6d96a0978d19e17b68d634497769987b16c8f4cd0a7a05048bec693caa6b/decorator-5.2.1.tar.gz", hash = "sha256:65f266143752f734b0a7cc83c46f4618af75b8c5911b00ccb61d0ac9b6da0360", size = 56711 }
|
||||||
|
wheels = [
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/4e/8c/f3147f5c4b73e7550fe5f9352eaa956ae838d5c51eb58e7a25b9f3e2643b/decorator-5.2.1-py3-none-any.whl", hash = "sha256:d316bb415a2d9e2d2b3abcc4084c6502fc09240e292cd76a76afc106a1c8e04a", size = 9190 },
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "dill"
|
name = "dill"
|
||||||
version = "0.3.8"
|
version = "0.3.8"
|
||||||
@@ -2087,24 +2096,28 @@ version = "0.0.1"
|
|||||||
source = { editable = "." }
|
source = { editable = "." }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "camel-ai", extra = ["owl"] },
|
{ name = "camel-ai", extra = ["owl"] },
|
||||||
|
{ name = "crawl4ai" },
|
||||||
{ name = "docx2markdown" },
|
{ name = "docx2markdown" },
|
||||||
{ name = "firecrawl" },
|
{ name = "firecrawl" },
|
||||||
{ name = "gradio" },
|
{ name = "gradio" },
|
||||||
{ name = "mcp-server-fetch" },
|
{ name = "mcp-server-fetch" },
|
||||||
{ name = "mcp-simple-arxiv" },
|
{ name = "mcp-simple-arxiv" },
|
||||||
{ name = "mistralai" },
|
{ name = "mistralai" },
|
||||||
|
{ name = "retry" },
|
||||||
{ name = "xmltodict" },
|
{ name = "xmltodict" },
|
||||||
]
|
]
|
||||||
|
|
||||||
[package.metadata]
|
[package.metadata]
|
||||||
requires-dist = [
|
requires-dist = [
|
||||||
{ name = "camel-ai", extras = ["owl"], specifier = "==0.2.84" },
|
{ name = "camel-ai", extras = ["owl"], specifier = "==0.2.84" },
|
||||||
|
{ name = "crawl4ai", specifier = ">=0.3.0" },
|
||||||
{ name = "docx2markdown", specifier = ">=0.1.1" },
|
{ name = "docx2markdown", specifier = ">=0.1.1" },
|
||||||
{ name = "firecrawl", specifier = ">=2.5.3" },
|
{ name = "firecrawl", specifier = ">=2.5.3" },
|
||||||
{ name = "gradio", specifier = ">=3.50.2" },
|
{ name = "gradio", specifier = ">=3.50.2" },
|
||||||
{ name = "mcp-server-fetch", specifier = "==2025.1.17" },
|
{ name = "mcp-server-fetch", specifier = "==2025.1.17" },
|
||||||
{ name = "mcp-simple-arxiv", specifier = "==0.2.2" },
|
{ name = "mcp-simple-arxiv", specifier = "==0.2.2" },
|
||||||
{ name = "mistralai", specifier = ">=1.7.0" },
|
{ name = "mistralai", specifier = ">=1.7.0" },
|
||||||
|
{ name = "retry", specifier = "==0.9.2" },
|
||||||
{ name = "xmltodict", specifier = ">=0.14.2" },
|
{ name = "xmltodict", specifier = ">=0.14.2" },
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -2363,6 +2376,15 @@ wheels = [
|
|||||||
{ url = "https://files.pythonhosted.org/packages/05/33/2d74d588408caedd065c2497bdb5ef83ce6082db01289a1e1147f6639802/psutil-5.9.8-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:d16bbddf0693323b8c6123dd804100241da461e41d6e332fb0ba6058f630f8c8", size = 249898 },
|
{ url = "https://files.pythonhosted.org/packages/05/33/2d74d588408caedd065c2497bdb5ef83ce6082db01289a1e1147f6639802/psutil-5.9.8-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:d16bbddf0693323b8c6123dd804100241da461e41d6e332fb0ba6058f630f8c8", size = 249898 },
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "py"
|
||||||
|
version = "1.11.0"
|
||||||
|
source = { registry = "https://pypi.org/simple" }
|
||||||
|
sdist = { url = "https://files.pythonhosted.org/packages/98/ff/fec109ceb715d2a6b4c4a85a61af3b40c723a961e8828319fbcb15b868dc/py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719", size = 207796 }
|
||||||
|
wheels = [
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/f6/f0/10642828a8dfb741e5f3fbaac830550a518a775c7fff6f04a007259b0548/py-1.11.0-py2.py3-none-any.whl", hash = "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378", size = 98708 },
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "pyarrow"
|
name = "pyarrow"
|
||||||
version = "20.0.0"
|
version = "20.0.0"
|
||||||
@@ -3058,6 +3080,19 @@ wheels = [
|
|||||||
{ url = "https://files.pythonhosted.org/packages/3f/51/d4db610ef29373b879047326cbf6fa98b6c1969d6f6dc423279de2b1be2c/requests_toolbelt-1.0.0-py2.py3-none-any.whl", hash = "sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06", size = 54481 },
|
{ url = "https://files.pythonhosted.org/packages/3f/51/d4db610ef29373b879047326cbf6fa98b6c1969d6f6dc423279de2b1be2c/requests_toolbelt-1.0.0-py2.py3-none-any.whl", hash = "sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06", size = 54481 },
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "retry"
|
||||||
|
version = "0.9.2"
|
||||||
|
source = { registry = "https://pypi.org/simple" }
|
||||||
|
dependencies = [
|
||||||
|
{ name = "decorator" },
|
||||||
|
{ name = "py" },
|
||||||
|
]
|
||||||
|
sdist = { url = "https://files.pythonhosted.org/packages/9d/72/75d0b85443fbc8d9f38d08d2b1b67cc184ce35280e4a3813cda2f445f3a4/retry-0.9.2.tar.gz", hash = "sha256:f8bfa8b99b69c4506d6f5bd3b0aabf77f98cdb17f3c9fc3f5ca820033336fba4", size = 6448 }
|
||||||
|
wheels = [
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/4b/0d/53aea75710af4528a25ed6837d71d117602b01946b307a3912cb3cfcbcba/retry-0.9.2-py2.py3-none-any.whl", hash = "sha256:ccddf89761fa2c726ab29391837d4327f819ea14d244c232a1d24c67a2f98606", size = 7986 },
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "rfc3339-validator"
|
name = "rfc3339-validator"
|
||||||
version = "0.1.4"
|
version = "0.1.4"
|
||||||
|
|||||||
Reference in New Issue
Block a user