From 07b811019e75bc209be98b5d8133b4bb6d8541e6 Mon Sep 17 00:00:00 2001 From: Tao Sun <168447269+fengju0213@users.noreply.github.com> Date: Tue, 22 Apr 2025 12:19:24 +0800 Subject: [PATCH] Update document_toolkit.py --- owl/utils/document_toolkit.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/owl/utils/document_toolkit.py b/owl/utils/document_toolkit.py index deccda2..f13faee 100644 --- a/owl/utils/document_toolkit.py +++ b/owl/utils/document_toolkit.py @@ -117,8 +117,19 @@ class DocumentProcessingToolkit(BaseToolkit): return True, content if self._is_webpage(document_path): - extracted_text = self._extract_webpage_content(document_path) - return True, extracted_text + try: + extracted_text = self._extract_webpage_content(document_path) + return True, extracted_text + except Exception: + try: + elements = self.uio.parse_file_or_url(document_path) + if elements is None: + logger.error(f"Failed to parse the document: {document_path}.") + return False, f"Failed to parse the document: {document_path}." + else: + return True, elements + except Exception: + return False, "Failed to extract content from the webpage." else: try: