From fbff7265a8135cf991164b3e594ef414627921f9 Mon Sep 17 00:00:00 2001 From: Sun Tao <2605127667@qq.com> Date: Mon, 14 Apr 2025 11:40:44 +0800 Subject: [PATCH] Update document_toolkit.py --- owl/utils/document_toolkit.py | 71 ++++++----------------------------- 1 file changed, 11 insertions(+), 60 deletions(-) diff --git a/owl/utils/document_toolkit.py b/owl/utils/document_toolkit.py index 5d81ce4..deccda2 100644 --- a/owl/utils/document_toolkit.py +++ b/owl/utils/document_toolkit.py @@ -12,13 +12,13 @@ # limitations under the License. # ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. ========= +from camel.loaders import UnstructuredIO from camel.toolkits.base import BaseToolkit from camel.toolkits.function_tool import FunctionTool from camel.toolkits import ImageAnalysisToolkit, ExcelToolkit from camel.utils import retry_on_error from camel.logger import get_logger from camel.models import BaseModelBackend -from docx2markdown._docx_to_markdown import docx_to_markdown from chunkr_ai import Chunkr import requests import mimetypes @@ -29,6 +29,7 @@ import os import subprocess import xmltodict import nest_asyncio +import traceback nest_asyncio.apply() @@ -52,6 +53,8 @@ class DocumentProcessingToolkit(BaseToolkit): if cache_dir: self.cache_dir = cache_dir + self.uio = UnstructuredIO() + @retry_on_error() def extract_document_content(self, document_path: str) -> Tuple[bool, str]: r"""Extract the content of a given document (or url) and return the processed text. @@ -63,7 +66,6 @@ class DocumentProcessingToolkit(BaseToolkit): Returns: Tuple[bool, str]: A tuple containing a boolean indicating whether the document was processed successfully, and the content of the document (if success). """ - import asyncio logger.debug( f"Calling extract_document_content function with document_path=`{document_path}`" @@ -119,67 +121,16 @@ class DocumentProcessingToolkit(BaseToolkit): return True, extracted_text else: - # judge if url - parsed_url = urlparse(document_path) - is_url = all([parsed_url.scheme, parsed_url.netloc]) - if not is_url: - if not os.path.exists(document_path): - return False, f"Document not found at path: {document_path}." - - # if is docx file, use docx2markdown to convert it - if document_path.endswith(".docx"): - if is_url: - tmp_path = self._download_file(document_path) - else: - tmp_path = document_path - - file_name = os.path.basename(tmp_path) - md_file_path = f"{file_name}.md" - docx_to_markdown(tmp_path, md_file_path) - - # load content of md file - with open(md_file_path, "r") as f: - extracted_text = f.read() - f.close() - return True, extracted_text try: - result = asyncio.run(self._extract_content_with_chunkr(document_path)) - return True, result + elements = self.uio.parse_file_or_url(document_path) + if elements is None: + logger.error(f"Failed to parse the document: {document_path}.") + return False, f"Failed to parse the document: {document_path}." + else: + return True, elements except Exception as e: - logger.warning( - f"Error occurred while using Chunkr to process document: {e}" - ) - if document_path.endswith(".pdf"): - # try using pypdf to extract text from pdf - try: - from PyPDF2 import PdfReader - - if is_url: - tmp_path = self._download_file(document_path) - document_path = tmp_path - - # Open file in binary mode for PdfReader - f = open(document_path, "rb") - reader = PdfReader(f) - extracted_text = "" - for page in reader.pages: - extracted_text += page.extract_text() - f.close() - - return True, extracted_text - - except Exception as pdf_error: - logger.error( - f"Error occurred while processing pdf: {pdf_error}" - ) - return ( - False, - f"Error occurred while processing pdf: {pdf_error}", - ) - - # If we get here, either it's not a PDF or PDF processing failed - logger.error(f"Error occurred while processing document: {e}") + logger.error(traceback.format_exc()) return False, f"Error occurred while processing document: {e}" def _is_webpage(self, url: str) -> bool: