From 256e263cff61a87218948790761a6724693166e9 Mon Sep 17 00:00:00 2001
From: tigermren <tigeren@live.com>
Date: Sun, 17 Aug 2025 23:12:45 +0800
Subject: [PATCH] =?UTF-8?q?feat:=20=E5=BC=80=E5=90=AFdocx=E8=A7=A3?=
 =?UTF-8?q?=E6=9E=90=EF=BC=8C=E4=BD=86=E6=98=AFmineru-api=E6=9C=AA?=
 =?UTF-8?q?=E6=94=AF=E6=8C=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../document_handlers/document_factory.py     |   6 +-
 .../document_handlers/processors/__init__.py  |   5 +-
 .../processors/docx_processor.py              | 219 ++++++++++++++++++
 .../processors/docx_processor.py.backup       |  77 ------
 .../processors/pdf_processor.py               |  32 ++-
 backend/app/core/services/document_service.py |   5 +-
 backend/app/services/file_service.py          |   2 +
 docker-compose.yml                            |   2 -
 8 files changed, 250 insertions(+), 98 deletions(-)
 create mode 100644 backend/app/core/document_handlers/processors/docx_processor.py
 delete mode 100644 backend/app/core/document_handlers/processors/docx_processor.py.backup

diff --git a/backend/app/core/document_handlers/document_factory.py b/backend/app/core/document_handlers/document_factory.py
index 530f536..cb2a73f 100644
--- a/backend/app/core/document_handlers/document_factory.py
+++ b/backend/app/core/document_handlers/document_factory.py
@@ -3,7 +3,7 @@ from typing import Optional
 from .document_processor import DocumentProcessor
 from .processors import (
     TxtDocumentProcessor,
-    # DocxDocumentProcessor,
+    DocxDocumentProcessor,
     PdfDocumentProcessor,
     MarkdownDocumentProcessor
 )
@@ -15,8 +15,8 @@ class DocumentProcessorFactory:
         
         processors = {
             '.txt': TxtDocumentProcessor,
-            # '.docx': DocxDocumentProcessor,
-            # '.doc': DocxDocumentProcessor,
+            '.docx': DocxDocumentProcessor,
+            '.doc': DocxDocumentProcessor,
             '.pdf': PdfDocumentProcessor,
             '.md': MarkdownDocumentProcessor,
             '.markdown': MarkdownDocumentProcessor
diff --git a/backend/app/core/document_handlers/processors/__init__.py b/backend/app/core/document_handlers/processors/__init__.py
index fd143d5..d8d35f0 100644
--- a/backend/app/core/document_handlers/processors/__init__.py
+++ b/backend/app/core/document_handlers/processors/__init__.py
@@ -1,7 +1,6 @@
 from .txt_processor import TxtDocumentProcessor
-# from .docx_processor import DocxDocumentProcessor
+from .docx_processor import DocxDocumentProcessor
 from .pdf_processor import PdfDocumentProcessor
 from .md_processor import MarkdownDocumentProcessor
 
-# __all__ = ['TxtDocumentProcessor', 'DocxDocumentProcessor', 'PdfDocumentProcessor', 'MarkdownDocumentProcessor']
-__all__ = ['TxtDocumentProcessor', 'PdfDocumentProcessor', 'MarkdownDocumentProcessor']
\ No newline at end of file
+__all__ = ['TxtDocumentProcessor', 'DocxDocumentProcessor', 'PdfDocumentProcessor', 'MarkdownDocumentProcessor']
\ No newline at end of file
diff --git a/backend/app/core/document_handlers/processors/docx_processor.py b/backend/app/core/document_handlers/processors/docx_processor.py
new file mode 100644
index 0000000..c451834
--- /dev/null
+++ b/backend/app/core/document_handlers/processors/docx_processor.py
@@ -0,0 +1,219 @@
+import os
+import requests
+import logging
+from typing import Dict, Any, Optional
+from ...document_handlers.document_processor import DocumentProcessor
+from ...services.ollama_client import OllamaClient
+from ...config import settings
+
+logger = logging.getLogger(__name__)
+
+class DocxDocumentProcessor(DocumentProcessor):
+    def __init__(self, input_path: str, output_path: str):
+        """Record the paths, create the scratch directory and load the Mineru settings."""
+        super().__init__()
+        source_stem, _ = os.path.splitext(os.path.basename(input_path))
+        target_dir = os.path.dirname(output_path)
+
+        self.input_path = input_path
+        self.output_path = output_path
+        self.output_dir = target_dir
+        self.name_without_suff = source_stem
+
+        # Scratch area: ".work/" plus the input file stem, under the output directory
+        self.work_dir = os.path.join(target_dir, ".work", source_stem)
+        os.makedirs(self.work_dir, exist_ok=True)
+
+        self.ollama_client = OllamaClient(model_name=settings.OLLAMA_MODEL, base_url=settings.OLLAMA_API_URL)
+
+        # Mineru API options; each falls back to the default shown when settings lacks it
+        self.mineru_base_url = getattr(settings, 'MINERU_API_URL', 'http://mineru-api:8000')
+        self.mineru_timeout = getattr(settings, 'MINERU_TIMEOUT', 300)  # seconds
+        self.mineru_lang_list = getattr(settings, 'MINERU_LANG_LIST', ['ch'])
+        self.mineru_backend = getattr(settings, 'MINERU_BACKEND', 'pipeline')
+        self.mineru_parse_method = getattr(settings, 'MINERU_PARSE_METHOD', 'auto')
+        self.mineru_formula_enable = getattr(settings, 'MINERU_FORMULA_ENABLE', True)
+        self.mineru_table_enable = getattr(settings, 'MINERU_TABLE_ENABLE', True)
+
+    def _call_mineru_api(self, file_path: str) -> Optional[Dict[str, Any]]:
+        """
+        Call Mineru API to convert DOCX to markdown
+
+        Args:
+            file_path: Path to the DOCX file
+
+        Returns:
+            Decoded JSON body of a 200 response
+
+        Raises:
+            Exception: on timeout, transport or file error, a non-200 status, or a 200
+                body that is not valid JSON; the underlying error is chained when present
+        """
+        url = f"{self.mineru_base_url}/file_parse"
+        # Prepare form data according to Mineru API specification
+        data = {
+            'output_dir': './output',
+            'lang_list': self.mineru_lang_list,
+            'backend': self.mineru_backend,
+            'parse_method': self.mineru_parse_method,
+            'formula_enable': self.mineru_formula_enable,
+            'table_enable': self.mineru_table_enable,
+            'return_md': True,
+            'return_middle_json': False,
+            'return_model_output': False,
+            'return_content_list': False,
+            'return_images': False,
+            'start_page_id': 0,
+            'end_page_id': 99999
+        }
+        # NOTE(review): the factory also routes .doc files here, yet the upload is always
+        # labelled with the DOCX MIME type - confirm the API accepts that
+        mime_type = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
+
+        try:
+            with open(file_path, 'rb') as file:
+                files = {'files': (os.path.basename(file_path), file, mime_type)}
+                logger.info(f"Calling Mineru API for DOCX processing at {url}")
+                response = requests.post(url, files=files, data=data, timeout=self.mineru_timeout)
+            if response.status_code == 200:
+                result = response.json()
+                logger.info("Successfully received response from Mineru API for DOCX")
+                return result
+        except requests.exceptions.Timeout as e:
+            error_msg = f"Mineru API request timed out after {self.mineru_timeout} seconds"
+            logger.error(error_msg)
+            raise Exception(error_msg) from e
+        except requests.exceptions.RequestException as e:
+            error_msg = f"Error calling Mineru API for DOCX: {str(e)}"
+            logger.error(error_msg)
+            raise Exception(error_msg) from e
+        except Exception as e:
+            error_msg = f"Unexpected error calling Mineru API for DOCX: {str(e)}"
+            logger.error(error_msg)
+            raise Exception(error_msg) from e
+
+        # Only non-200 responses reach this point. The error is raised outside the try
+        # block so it is not caught again and re-labelled as an "Unexpected error"
+        error_msg = f"Mineru API returned status code {response.status_code}: {response.text}"
+        if response.status_code == 400:
+            # For 400 errors, prefer the structured message when the body carries one
+            try:
+                error_data = response.json()
+                if isinstance(error_data, dict) and 'error' in error_data:
+                    error_msg = f"Mineru API error: {error_data['error']}"
+            except ValueError:
+                pass
+        logger.error(error_msg)
+        raise Exception(error_msg)
+
+    def _extract_markdown_from_response(self, response: Dict[str, Any]) -> str:
+        """
+        Extract markdown content from Mineru API response
+
+        Search order:
+          1. a known markdown field on the response itself;
+          2. the same fields inside a ``result`` or ``data`` wrapper dictionary;
+          3. for a list response, whatever its first element yields;
+          4. as a last resort, nested dictionaries and long top-level strings.
+
+        Args:
+            response: Mineru API response dictionary
+
+        Returns:
+            Extracted markdown content as string, or "" when nothing usable is
+            found or the response cannot be inspected
+        """
+        # Field names tried in order of preference
+        known_keys = (
+            'markdown',
+            'md',
+            'content',
+            'text',
+            'md_content',  # assumed name in current Mineru responses - TODO confirm
+        )
+
+        try:
+            logger.debug(f"Mineru API response structure for DOCX: {response}")
+
+            # Deal with non-dict payloads first: the key lookups below would turn into
+            # a substring test on a string and then fail when indexing it
+            if isinstance(response, str):
+                return response
+            if isinstance(response, list):
+                # If response is a list, try to extract from first item
+                first_item = response[0] if response else None
+                if isinstance(first_item, (dict, str)):
+                    return self._extract_markdown_from_response(first_item)
+                return ""
+            if not isinstance(response, dict):
+                return ""
+
+            # The response itself, followed by the wrapper dictionaries it may carry
+            candidates = [response]
+            for wrapper_key in ('result', 'data'):
+                wrapper = response.get(wrapper_key)
+                if isinstance(wrapper, dict):
+                    candidates.append(wrapper)
+
+            for candidate in candidates:
+                for key in known_keys:
+                    value = candidate.get(key)
+                    # Only a non-empty string is usable; the caller writes it to a file
+                    if isinstance(value, str) and value:
+                        return value
+
+            # If no standard format found, try to extract from the response structure
+            logger.warning("Could not find standard markdown field in Mineru response for DOCX")
+            for value in response.values():
+                if isinstance(value, str) and len(value) > 100:  # Likely content
+                    return value
+                if isinstance(value, dict):
+                    # Recursively search in nested dictionaries
+                    nested_content = self._extract_markdown_from_response(value)
+                    if nested_content:
+                        return nested_content
+            # Every path returns a string, never an implicit None
+            return ""
+
+        except Exception as e:
+            logger.error(f"Error extracting markdown from Mineru response for DOCX: {str(e)}")
+            return ""
+
+    def read_content(self) -> str:
+        """
+        Convert the input document to markdown through the Mineru API.
+
+        The raw markdown is also saved to the work directory for reference.
+
+        Returns:
+            The markdown text extracted from the API response.
+        Raises:
+            Exception: if the API call fails or the response holds no markdown.
+        """
+        logger.info("Starting DOCX content processing with Mineru API")
+        api_payload = self._call_mineru_api(self.input_path)
+        md_text = self._extract_markdown_from_response(api_payload)
+        if not md_text:
+            raise Exception("No markdown content found in Mineru API response for DOCX")
+        logger.info(f"Successfully extracted {len(md_text)} characters of markdown content from DOCX")
+
+        raw_copy_path = os.path.join(self.work_dir, f"{self.name_without_suff}.md")
+        with open(raw_copy_path, 'w', encoding='utf-8') as raw_copy:
+            raw_copy.write(md_text)
+        logger.info(f"Saved raw markdown content from DOCX to {raw_copy_path}")
+        return md_text
+
+    def save_content(self, content: str) -> None:
+        """Write ``content`` beside the configured output path, forcing a ``.md`` suffix."""
+        # Swap whatever extension output_path carries for .md
+        stem, _ = os.path.splitext(os.path.basename(self.output_path))
+        md_output_path = os.path.join(os.path.dirname(self.output_path), f"{stem}.md")
+
+        logger.info(f"Saving masked DOCX content to: {md_output_path}")
+        try:
+            with open(md_output_path, 'w', encoding='utf-8') as md_file:
+                md_file.write(content)
+            logger.info(f"Successfully saved masked DOCX content to {md_output_path}")
+        except Exception as e:
+            logger.error(f"Error saving masked DOCX content: {e}")
+            raise
\ No newline at end of file
diff --git a/backend/app/core/document_handlers/processors/docx_processor.py.backup b/backend/app/core/document_handlers/processors/docx_processor.py.backup
deleted file mode 100644
index 598ba09..0000000
--- a/backend/app/core/document_handlers/processors/docx_processor.py.backup
+++ /dev/null
@@ -1,77 +0,0 @@
-import os
-import docx
-from ...document_handlers.document_processor import DocumentProcessor
-from magic_pdf.data.data_reader_writer import FileBasedDataWriter
-from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
-from magic_pdf.data.read_api import read_local_office
-import logging
-from ...services.ollama_client import OllamaClient
-from ...config import settings
-from ...prompts.masking_prompts import get_masking_mapping_prompt
-
-logger = logging.getLogger(__name__)
-
-class DocxDocumentProcessor(DocumentProcessor):
-    def __init__(self, input_path: str, output_path: str):
-        super().__init__()  # Call parent class's __init__
-        self.input_path = input_path
-        self.output_path = output_path
-        self.output_dir = os.path.dirname(output_path)
-        self.name_without_suff = os.path.splitext(os.path.basename(input_path))[0]
-        
-        # Setup output directories
-        self.local_image_dir = os.path.join(self.output_dir, "images")
-        self.image_dir = os.path.basename(self.local_image_dir)
-        os.makedirs(self.local_image_dir, exist_ok=True)
-        
-        self.ollama_client = OllamaClient(model_name=settings.OLLAMA_MODEL, base_url=settings.OLLAMA_API_URL)
-
-    def read_content(self) -> str:
-        try:
-            # Initialize writers
-            image_writer = FileBasedDataWriter(self.local_image_dir)
-            md_writer = FileBasedDataWriter(self.output_dir)
-            
-            # Create Dataset Instance and process
-            ds = read_local_office(self.input_path)[0]
-            pipe_result = ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer)
-            
-            # Generate markdown
-            md_content = pipe_result.get_markdown(self.image_dir)
-            pipe_result.dump_md(md_writer, f"{self.name_without_suff}.md", self.image_dir)
-            
-            return md_content
-        except Exception as e:
-            logger.error(f"Error converting DOCX to MD: {e}")
-            raise
-
-    # def process_content(self, content: str) -> str:
-    #     logger.info("Processing DOCX content")
-        
-    #     # Split content into sentences and apply masking
-    #     sentences = content.split("。")
-    #     final_md = ""
-    #     for sentence in sentences:
-    #         if sentence.strip():  # Only process non-empty sentences
-    #             formatted_prompt = get_masking_mapping_prompt(sentence)
-    #             logger.info("Calling ollama to generate response, prompt: %s", formatted_prompt)
-    #             response = self.ollama_client.generate(formatted_prompt)
-    #             logger.info(f"Response generated: {response}")
-    #             final_md += response + "。"
-        
-    #     return final_md
-
-    def save_content(self, content: str) -> None:
-        # Ensure output path has .md extension
-        output_dir = os.path.dirname(self.output_path)
-        base_name = os.path.splitext(os.path.basename(self.output_path))[0]
-        md_output_path = os.path.join(output_dir, f"{base_name}.md")
-        
-        logger.info(f"Saving masked content to: {md_output_path}")
-        try:
-            with open(md_output_path, 'w', encoding='utf-8') as file:
-                file.write(content)
-            logger.info(f"Successfully saved content to {md_output_path}")
-        except Exception as e:
-            logger.error(f"Error saving content: {e}")
-            raise
\ No newline at end of file
diff --git a/backend/app/core/document_handlers/processors/pdf_processor.py b/backend/app/core/document_handlers/processors/pdf_processor.py
index 99737a1..6409c3a 100644
--- a/backend/app/core/document_handlers/processors/pdf_processor.py
+++ b/backend/app/core/document_handlers/processors/pdf_processor.py
@@ -81,18 +81,30 @@ class PdfDocumentProcessor(DocumentProcessor):
                     logger.info("Successfully received response from Mineru API")
                     return result
                 else:
-                    logger.error(f"Mineru API returned status code {response.status_code}: {response.text}")
-                    return None
+                    error_msg = f"Mineru API returned status code {response.status_code}: {response.text}"
+                    logger.error(error_msg)
+                    # For 400 errors, include more specific information
+                    if response.status_code == 400:
+                        try:
+                            error_data = response.json()
+                            if 'error' in error_data:
+                                error_msg = f"Mineru API error: {error_data['error']}"
+                        except:
+                            pass
+                    raise Exception(error_msg)
                     
         except requests.exceptions.Timeout:
-            logger.error(f"Mineru API request timed out after {self.mineru_timeout} seconds")
-            return None
+            error_msg = f"Mineru API request timed out after {self.mineru_timeout} seconds"
+            logger.error(error_msg)
+            raise Exception(error_msg)
         except requests.exceptions.RequestException as e:
-            logger.error(f"Error calling Mineru API: {str(e)}")
-            return None
+            error_msg = f"Error calling Mineru API: {str(e)}"
+            logger.error(error_msg)
+            raise Exception(error_msg)
         except Exception as e:
-            logger.error(f"Unexpected error calling Mineru API: {str(e)}")
-            return None
+            error_msg = f"Unexpected error calling Mineru API: {str(e)}"
+            logger.error(error_msg)
+            raise Exception(error_msg)
 
     def _extract_markdown_from_response(self, response: Dict[str, Any]) -> str:
         """
@@ -171,11 +183,9 @@ class PdfDocumentProcessor(DocumentProcessor):
         logger.info("Starting PDF content processing with Mineru API")
         
         # Call Mineru API to convert PDF to markdown
+        # This will raise an exception if the API call fails
         mineru_response = self._call_mineru_api(self.input_path)
         
-        if not mineru_response:
-            raise Exception("Failed to get response from Mineru API")
-        
         # Extract markdown content from the response
         markdown_content = self._extract_markdown_from_response(mineru_response)
         
diff --git a/backend/app/core/services/document_service.py b/backend/app/core/services/document_service.py
index c169bfa..8f9c187 100644
--- a/backend/app/core/services/document_service.py
+++ b/backend/app/core/services/document_service.py
@@ -13,7 +13,7 @@ class DocumentService:
             processor = DocumentProcessorFactory.create_processor(input_path, output_path)
             if not processor:
                 logger.error(f"Unsupported file format: {input_path}")
-                return False
+                raise Exception(f"Unsupported file format: {input_path}")
 
             # Read content
             content = processor.read_content()
@@ -27,4 +27,5 @@ class DocumentService:
 
         except Exception as e:
             logger.error(f"Error processing document {input_path}: {str(e)}")
-            return False
\ No newline at end of file
+            # Re-raise the exception so the Celery task can handle it properly
+            raise
\ No newline at end of file
diff --git a/backend/app/services/file_service.py b/backend/app/services/file_service.py
index a08c7b2..9ac38cc 100644
--- a/backend/app/services/file_service.py
+++ b/backend/app/services/file_service.py
@@ -70,6 +70,7 @@ def process_file(file_id: str):
             output_path = str(settings.PROCESSED_FOLDER / output_filename)
             
             # Process document with both input and output paths
+            # This will raise an exception if processing fails
             process_service.process_document(file.original_path, output_path)
             
             # Update file record with processed path
@@ -81,6 +82,7 @@ def process_file(file_id: str):
             file.status = FileStatus.FAILED
             file.error_message = str(e)
             db.commit()
+            # Re-raise the exception to ensure Celery marks the task as failed
             raise
             
     finally:
diff --git a/docker-compose.yml b/docker-compose.yml
index 260af55..b450a60 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -34,7 +34,6 @@ services:
       - "8000:8000"
     volumes:
       - ./backend/storage:/app/storage
-      - ./backend/legal_doc_masker.db:/app/legal_doc_masker.db
     env_file:
       - ./backend/.env
     environment:
@@ -55,7 +54,6 @@ services:
     command: celery -A app.services.file_service worker --loglevel=info
     volumes:
       - ./backend/storage:/app/storage
-      - ./backend/legal_doc_masker.db:/app/legal_doc_masker.db
     env_file:
       - ./backend/.env
     environment: