From 256e263cff61a87218948790761a6724693166e9 Mon Sep 17 00:00:00 2001 From: tigermren Date: Sun, 17 Aug 2025 23:12:45 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E5=BC=80=E5=90=AFdocx=E8=A7=A3?= =?UTF-8?q?=E6=9E=90=EF=BC=8C=E4=BD=86=E6=98=AFmineru-api=E6=9C=AA?= =?UTF-8?q?=E6=94=AF=E6=8C=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../document_handlers/document_factory.py | 6 +- .../document_handlers/processors/__init__.py | 5 +- .../processors/docx_processor.py | 219 ++++++++++++++++++ .../processors/docx_processor.py.backup | 77 ------ .../processors/pdf_processor.py | 32 ++- backend/app/core/services/document_service.py | 5 +- backend/app/services/file_service.py | 2 + docker-compose.yml | 2 - 8 files changed, 250 insertions(+), 98 deletions(-) create mode 100644 backend/app/core/document_handlers/processors/docx_processor.py delete mode 100644 backend/app/core/document_handlers/processors/docx_processor.py.backup diff --git a/backend/app/core/document_handlers/document_factory.py b/backend/app/core/document_handlers/document_factory.py index 530f536..cb2a73f 100644 --- a/backend/app/core/document_handlers/document_factory.py +++ b/backend/app/core/document_handlers/document_factory.py @@ -3,7 +3,7 @@ from typing import Optional from .document_processor import DocumentProcessor from .processors import ( TxtDocumentProcessor, - # DocxDocumentProcessor, + DocxDocumentProcessor, PdfDocumentProcessor, MarkdownDocumentProcessor ) @@ -15,8 +15,8 @@ class DocumentProcessorFactory: processors = { '.txt': TxtDocumentProcessor, - # '.docx': DocxDocumentProcessor, - # '.doc': DocxDocumentProcessor, + '.docx': DocxDocumentProcessor, + '.doc': DocxDocumentProcessor, '.pdf': PdfDocumentProcessor, '.md': MarkdownDocumentProcessor, '.markdown': MarkdownDocumentProcessor diff --git a/backend/app/core/document_handlers/processors/__init__.py b/backend/app/core/document_handlers/processors/__init__.py index fd143d5..d8d35f0 100644 --- a/backend/app/core/document_handlers/processors/__init__.py +++ b/backend/app/core/document_handlers/processors/__init__.py @@ -1,7 +1,6 @@ from .txt_processor import TxtDocumentProcessor -# from .docx_processor import DocxDocumentProcessor +from .docx_processor import DocxDocumentProcessor from .pdf_processor import PdfDocumentProcessor from .md_processor import MarkdownDocumentProcessor -# __all__ = ['TxtDocumentProcessor', 'DocxDocumentProcessor', 'PdfDocumentProcessor', 'MarkdownDocumentProcessor'] -__all__ = ['TxtDocumentProcessor', 'PdfDocumentProcessor', 'MarkdownDocumentProcessor'] \ No newline at end of file +__all__ = ['TxtDocumentProcessor', 'DocxDocumentProcessor', 'PdfDocumentProcessor', 'MarkdownDocumentProcessor'] \ No newline at end of file diff --git a/backend/app/core/document_handlers/processors/docx_processor.py b/backend/app/core/document_handlers/processors/docx_processor.py new file mode 100644 index 0000000..c451834 --- /dev/null +++ b/backend/app/core/document_handlers/processors/docx_processor.py @@ -0,0 +1,219 @@ +import os +import requests +import logging +from typing import Dict, Any, Optional +from ...document_handlers.document_processor import DocumentProcessor +from ...services.ollama_client import OllamaClient +from ...config import settings + +logger = logging.getLogger(__name__) + +class DocxDocumentProcessor(DocumentProcessor): + def __init__(self, input_path: str, output_path: str): + super().__init__() # Call parent class's __init__ + self.input_path = input_path + self.output_path = output_path + self.output_dir = os.path.dirname(output_path) + self.name_without_suff = os.path.splitext(os.path.basename(input_path))[0] + + # Setup work directory for temporary files + self.work_dir = os.path.join( + os.path.dirname(output_path), + ".work", + os.path.splitext(os.path.basename(input_path))[0] + ) + os.makedirs(self.work_dir, exist_ok=True) + + self.ollama_client = OllamaClient(model_name=settings.OLLAMA_MODEL, base_url=settings.OLLAMA_API_URL) + + # Mineru API configuration + self.mineru_base_url = getattr(settings, 'MINERU_API_URL', 'http://mineru-api:8000') + self.mineru_timeout = getattr(settings, 'MINERU_TIMEOUT', 300) # 5 minutes timeout + self.mineru_lang_list = getattr(settings, 'MINERU_LANG_LIST', ['ch']) + self.mineru_backend = getattr(settings, 'MINERU_BACKEND', 'pipeline') + self.mineru_parse_method = getattr(settings, 'MINERU_PARSE_METHOD', 'auto') + self.mineru_formula_enable = getattr(settings, 'MINERU_FORMULA_ENABLE', True) + self.mineru_table_enable = getattr(settings, 'MINERU_TABLE_ENABLE', True) + + def _call_mineru_api(self, file_path: str) -> Optional[Dict[str, Any]]: + """ + Call Mineru API to convert DOCX to markdown + + Args: + file_path: Path to the DOCX file + + Returns: + API response as dictionary or None if failed + """ + try: + url = f"{self.mineru_base_url}/file_parse" + + with open(file_path, 'rb') as file: + files = {'files': (os.path.basename(file_path), file, 'application/vnd.openxmlformats-officedocument.wordprocessingml.document')} + + # Prepare form data according to Mineru API specification + data = { + 'output_dir': './output', + 'lang_list': self.mineru_lang_list, + 'backend': self.mineru_backend, + 'parse_method': self.mineru_parse_method, + 'formula_enable': self.mineru_formula_enable, + 'table_enable': self.mineru_table_enable, + 'return_md': True, + 'return_middle_json': False, + 'return_model_output': False, + 'return_content_list': False, + 'return_images': False, + 'start_page_id': 0, + 'end_page_id': 99999 + } + + logger.info(f"Calling Mineru API for DOCX processing at {url}") + response = requests.post( + url, + files=files, + data=data, + timeout=self.mineru_timeout + ) + + if response.status_code == 200: + result = response.json() + logger.info("Successfully received response from Mineru API for DOCX") + return result + else: + error_msg = f"Mineru API returned status code {response.status_code}: {response.text}" + logger.error(error_msg) + # For 400 errors, include more specific information + if response.status_code == 400: + try: + error_data = response.json() + if 'error' in error_data: + error_msg = f"Mineru API error: {error_data['error']}" + except: + pass + raise Exception(error_msg) + + except requests.exceptions.Timeout: + error_msg = f"Mineru API request timed out after {self.mineru_timeout} seconds" + logger.error(error_msg) + raise Exception(error_msg) + except requests.exceptions.RequestException as e: + error_msg = f"Error calling Mineru API for DOCX: {str(e)}" + logger.error(error_msg) + raise Exception(error_msg) + except Exception as e: + error_msg = f"Unexpected error calling Mineru API for DOCX: {str(e)}" + logger.error(error_msg) + raise Exception(error_msg) + + def _extract_markdown_from_response(self, response: Dict[str, Any]) -> str: + """ + Extract markdown content from Mineru API response + + Args: + response: Mineru API response dictionary + + Returns: + Extracted markdown content as string + """ + try: + logger.debug(f"Mineru API response structure for DOCX: {response}") + + # Try different possible response formats based on Mineru API + if 'markdown' in response: + return response['markdown'] + elif 'md' in response: + return response['md'] + elif 'content' in response: + return response['content'] + elif 'text' in response: + return response['text'] + elif 'result' in response and isinstance(response['result'], dict): + result = response['result'] + if 'markdown' in result: + return result['markdown'] + elif 'md' in result: + return result['md'] + elif 'content' in result: + return result['content'] + elif 'text' in result: + return result['text'] + elif 'data' in response and isinstance(response['data'], dict): + data = response['data'] + if 'markdown' in data: + return data['markdown'] + elif 'md' in data: + return data['md'] + elif 'content' in data: + return data['content'] + elif 'text' in data: + return data['text'] + elif isinstance(response, list) and len(response) > 0: + # If response is a list, try to extract from first item + first_item = response[0] + if isinstance(first_item, dict): + return self._extract_markdown_from_response(first_item) + elif isinstance(first_item, str): + return first_item + else: + # If no standard format found, try to extract from the response structure + logger.warning("Could not find standard markdown field in Mineru response for DOCX") + + # Return the response as string if it's simple, or empty string + if isinstance(response, str): + return response + elif isinstance(response, dict): + # Try to find any text-like content + for key, value in response.items(): + if isinstance(value, str) and len(value) > 100: # Likely content + return value + elif isinstance(value, dict): + # Recursively search in nested dictionaries + nested_content = self._extract_markdown_from_response(value) + if nested_content: + return nested_content + + return "" + + except Exception as e: + logger.error(f"Error extracting markdown from Mineru response for DOCX: {str(e)}") + return "" + + def read_content(self) -> str: + logger.info("Starting DOCX content processing with Mineru API") + + # Call Mineru API to convert DOCX to markdown + # This will raise an exception if the API call fails + mineru_response = self._call_mineru_api(self.input_path) + + # Extract markdown content from the response + markdown_content = self._extract_markdown_from_response(mineru_response) + + if not markdown_content: + raise Exception("No markdown content found in Mineru API response for DOCX") + + logger.info(f"Successfully extracted {len(markdown_content)} characters of markdown content from DOCX") + + # Save the raw markdown content to work directory for reference + md_output_path = os.path.join(self.work_dir, f"{self.name_without_suff}.md") + with open(md_output_path, 'w', encoding='utf-8') as file: + file.write(markdown_content) + + logger.info(f"Saved raw markdown content from DOCX to {md_output_path}") + + return markdown_content + + def save_content(self, content: str) -> None: + # Ensure output path has .md extension + output_dir = os.path.dirname(self.output_path) + base_name = os.path.splitext(os.path.basename(self.output_path))[0] + md_output_path = os.path.join(output_dir, f"{base_name}.md") + + logger.info(f"Saving masked DOCX content to: {md_output_path}") + try: + with open(md_output_path, 'w', encoding='utf-8') as file: + file.write(content) + logger.info(f"Successfully saved masked DOCX content to {md_output_path}") + except Exception as e: + logger.error(f"Error saving masked DOCX content: {e}") + raise \ No newline at end of file diff --git a/backend/app/core/document_handlers/processors/docx_processor.py.backup b/backend/app/core/document_handlers/processors/docx_processor.py.backup deleted file mode 100644 index 598ba09..0000000 --- a/backend/app/core/document_handlers/processors/docx_processor.py.backup +++ /dev/null @@ -1,77 +0,0 @@ -import os -import docx -from ...document_handlers.document_processor import DocumentProcessor -from magic_pdf.data.data_reader_writer import FileBasedDataWriter -from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze -from magic_pdf.data.read_api import read_local_office -import logging -from ...services.ollama_client import OllamaClient -from ...config import settings -from ...prompts.masking_prompts import get_masking_mapping_prompt - -logger = logging.getLogger(__name__) - -class DocxDocumentProcessor(DocumentProcessor): - def __init__(self, input_path: str, output_path: str): - super().__init__() # Call parent class's __init__ - self.input_path = input_path - self.output_path = output_path - self.output_dir = os.path.dirname(output_path) - self.name_without_suff = os.path.splitext(os.path.basename(input_path))[0] - - # Setup output directories - self.local_image_dir = os.path.join(self.output_dir, "images") - self.image_dir = os.path.basename(self.local_image_dir) - os.makedirs(self.local_image_dir, exist_ok=True) - - self.ollama_client = OllamaClient(model_name=settings.OLLAMA_MODEL, base_url=settings.OLLAMA_API_URL) - - def read_content(self) -> str: - try: - # Initialize writers - image_writer = FileBasedDataWriter(self.local_image_dir) - md_writer = FileBasedDataWriter(self.output_dir) - - # Create Dataset Instance and process - ds = read_local_office(self.input_path)[0] - pipe_result = ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer) - - # Generate markdown - md_content = pipe_result.get_markdown(self.image_dir) - pipe_result.dump_md(md_writer, f"{self.name_without_suff}.md", self.image_dir) - - return md_content - except Exception as e: - logger.error(f"Error converting DOCX to MD: {e}") - raise - - # def process_content(self, content: str) -> str: - # logger.info("Processing DOCX content") - - # # Split content into sentences and apply masking - # sentences = content.split("。") - # final_md = "" - # for sentence in sentences: - # if sentence.strip(): # Only process non-empty sentences - # formatted_prompt = get_masking_mapping_prompt(sentence) - # logger.info("Calling ollama to generate response, prompt: %s", formatted_prompt) - # response = self.ollama_client.generate(formatted_prompt) - # logger.info(f"Response generated: {response}") - # final_md += response + "。" - - # return final_md - - def save_content(self, content: str) -> None: - # Ensure output path has .md extension - output_dir = os.path.dirname(self.output_path) - base_name = os.path.splitext(os.path.basename(self.output_path))[0] - md_output_path = os.path.join(output_dir, f"{base_name}.md") - - logger.info(f"Saving masked content to: {md_output_path}") - try: - with open(md_output_path, 'w', encoding='utf-8') as file: - file.write(content) - logger.info(f"Successfully saved content to {md_output_path}") - except Exception as e: - logger.error(f"Error saving content: {e}") - raise \ No newline at end of file diff --git a/backend/app/core/document_handlers/processors/pdf_processor.py b/backend/app/core/document_handlers/processors/pdf_processor.py index 99737a1..6409c3a 100644 --- a/backend/app/core/document_handlers/processors/pdf_processor.py +++ b/backend/app/core/document_handlers/processors/pdf_processor.py @@ -81,18 +81,30 @@ class PdfDocumentProcessor(DocumentProcessor): logger.info("Successfully received response from Mineru API") return result else: - logger.error(f"Mineru API returned status code {response.status_code}: {response.text}") - return None + error_msg = f"Mineru API returned status code {response.status_code}: {response.text}" + logger.error(error_msg) + # For 400 errors, include more specific information + if response.status_code == 400: + try: + error_data = response.json() + if 'error' in error_data: + error_msg = f"Mineru API error: {error_data['error']}" + except: + pass + raise Exception(error_msg) except requests.exceptions.Timeout: - logger.error(f"Mineru API request timed out after {self.mineru_timeout} seconds") - return None + error_msg = f"Mineru API request timed out after {self.mineru_timeout} seconds" + logger.error(error_msg) + raise Exception(error_msg) except requests.exceptions.RequestException as e: - logger.error(f"Error calling Mineru API: {str(e)}") - return None + error_msg = f"Error calling Mineru API: {str(e)}" + logger.error(error_msg) + raise Exception(error_msg) except Exception as e: - logger.error(f"Unexpected error calling Mineru API: {str(e)}") - return None + error_msg = f"Unexpected error calling Mineru API: {str(e)}" + logger.error(error_msg) + raise Exception(error_msg) def _extract_markdown_from_response(self, response: Dict[str, Any]) -> str: """ @@ -171,11 +183,9 @@ class PdfDocumentProcessor(DocumentProcessor): logger.info("Starting PDF content processing with Mineru API") # Call Mineru API to convert PDF to markdown + # This will raise an exception if the API call fails mineru_response = self._call_mineru_api(self.input_path) - if not mineru_response: - raise Exception("Failed to get response from Mineru API") - # Extract markdown content from the response markdown_content = self._extract_markdown_from_response(mineru_response) diff --git a/backend/app/core/services/document_service.py b/backend/app/core/services/document_service.py index c169bfa..8f9c187 100644 --- a/backend/app/core/services/document_service.py +++ b/backend/app/core/services/document_service.py @@ -13,7 +13,7 @@ class DocumentService: processor = DocumentProcessorFactory.create_processor(input_path, output_path) if not processor: logger.error(f"Unsupported file format: {input_path}") - return False + raise Exception(f"Unsupported file format: {input_path}") # Read content content = processor.read_content() @@ -27,4 +27,5 @@ class DocumentService: except Exception as e: logger.error(f"Error processing document {input_path}: {str(e)}") - return False \ No newline at end of file + # Re-raise the exception so the Celery task can handle it properly + raise \ No newline at end of file diff --git a/backend/app/services/file_service.py b/backend/app/services/file_service.py index a08c7b2..9ac38cc 100644 --- a/backend/app/services/file_service.py +++ b/backend/app/services/file_service.py @@ -70,6 +70,7 @@ def process_file(file_id: str): output_path = str(settings.PROCESSED_FOLDER / output_filename) # Process document with both input and output paths + # This will raise an exception if processing fails process_service.process_document(file.original_path, output_path) # Update file record with processed path @@ -81,6 +82,7 @@ def process_file(file_id: str): file.status = FileStatus.FAILED file.error_message = str(e) db.commit() + # Re-raise the exception to ensure Celery marks the task as failed raise finally: diff --git a/docker-compose.yml b/docker-compose.yml index 260af55..b450a60 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -34,7 +34,6 @@ services: - "8000:8000" volumes: - ./backend/storage:/app/storage - - ./backend/legal_doc_masker.db:/app/legal_doc_masker.db env_file: - ./backend/.env environment: @@ -55,7 +54,6 @@ services: command: celery -A app.services.file_service worker --loglevel=info volumes: - ./backend/storage:/app/storage - - ./backend/legal_doc_masker.db:/app/legal_doc_masker.db env_file: - ./backend/.env environment: