Revert "Update document_toolkit.py"

This reverts commit a2cd3b28d5.
2026-03-22 14:07:17 +08:00 · 2025-04-14 11:35:55 +08:00
parent a2cd3b28d5
commit 562a620082
1 changed files with 60 additions and 11 deletions
--- a/owl/utils/document_toolkit.py
+++ b/owl/utils/document_toolkit.py
@@ -12,13 +12,13 @@
 # limitations under the License.
 # ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========

-from camel.loaders import UnstructuredIO
 from camel.toolkits.base import BaseToolkit
 from camel.toolkits.function_tool import FunctionTool
 from camel.toolkits import ImageAnalysisToolkit, ExcelToolkit
 from camel.utils import retry_on_error
 from camel.logger import get_logger
 from camel.models import BaseModelBackend
+from docx2markdown._docx_to_markdown import docx_to_markdown
 from chunkr_ai import Chunkr
 import requests
 import mimetypes
@@ -29,7 +29,6 @@ import os
 import subprocess
 import xmltodict
 import nest_asyncio
-import traceback

 nest_asyncio.apply()

@@ -53,8 +52,6 @@ class DocumentProcessingToolkit(BaseToolkit):
        if cache_dir:
            self.cache_dir = cache_dir

-        self.uio = UnstructuredIO()
-
    @retry_on_error()
    def extract_document_content(self, document_path: str) -> Tuple[bool, str]:
        r"""Extract the content of a given document (or url) and return the processed text.
@@ -66,6 +63,7 @@ class DocumentProcessingToolkit(BaseToolkit):
        Returns:
            Tuple[bool, str]: A tuple containing a boolean indicating whether the document was processed successfully, and the content of the document (if success).
        """
+        import asyncio

        logger.debug(
            f"Calling extract_document_content function with document_path=`{document_path}`"
@@ -121,16 +119,67 @@ class DocumentProcessingToolkit(BaseToolkit):
            return True, extracted_text

        else:
-            try:
-                elements = self.uio.parse_file_or_url(document_path)
-                if elements is None:
-                    logger.error(f"Failed to parse the document: {document_path}.")
-                    return False, f"Failed to parse the document: {document_path}."
+            # judge if url
+            parsed_url = urlparse(document_path)
+            is_url = all([parsed_url.scheme, parsed_url.netloc])
+            if not is_url:
+                if not os.path.exists(document_path):
+                    return False, f"Document not found at path: {document_path}."
+
+            # if is docx file, use docx2markdown to convert it
+            if document_path.endswith(".docx"):
+                if is_url:
+                    tmp_path = self._download_file(document_path)
                else:
-                    return True, elements
+                    tmp_path = document_path
+
+                file_name = os.path.basename(tmp_path)
+                md_file_path = f"{file_name}.md"
+                docx_to_markdown(tmp_path, md_file_path)
+
+                # load content of md file
+                with open(md_file_path, "r") as f:
+                    extracted_text = f.read()
+                f.close()
+                return True, extracted_text
+            try:
+                result = asyncio.run(self._extract_content_with_chunkr(document_path))
+                return True, result

            except Exception as e:
-                logger.error(traceback.format_exc())
+                logger.warning(
+                    f"Error occurred while using Chunkr to process document: {e}"
+                )
+                if document_path.endswith(".pdf"):
+                    # try using pypdf to extract text from pdf
+                    try:
+                        from PyPDF2 import PdfReader
+
+                        if is_url:
+                            tmp_path = self._download_file(document_path)
+                            document_path = tmp_path
+
+                        # Open file in binary mode for PdfReader
+                        f = open(document_path, "rb")
+                        reader = PdfReader(f)
+                        extracted_text = ""
+                        for page in reader.pages:
+                            extracted_text += page.extract_text()
+                        f.close()
+
+                        return True, extracted_text
+
+                    except Exception as pdf_error:
+                        logger.error(
+                            f"Error occurred while processing pdf: {pdf_error}"
+                        )
+                        return (
+                            False,
+                            f"Error occurred while processing pdf: {pdf_error}",
+                        )
+
+                # If we get here, either it's not a PDF or PDF processing failed
+                logger.error(f"Error occurred while processing document: {e}")
                return False, f"Error occurred while processing document: {e}"

    def _is_webpage(self, url: str) -> bool: