feat: enable DOCX parsing, but mineru-api does not support it yet
parent 1138683da1
commit 256e263cff
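The diffs below only turn the existing DocxDocumentProcessor wiring on; end to end it is exercised roughly as in the following sketch (illustrative only, with hypothetical paths and import location; create_processor, read_content and save_content are the calls visible in the diffs):

# Illustrative sketch only; not part of this commit.
from app.document_handlers.document_processor_factory import DocumentProcessorFactory  # assumed module path

input_path = "storage/uploads/contract.docx"    # hypothetical upload
output_path = "storage/processed/contract.md"   # hypothetical target

processor = DocumentProcessorFactory.create_processor(input_path, output_path)
if not processor:
    raise Exception(f"Unsupported file format: {input_path}")

# DocxDocumentProcessor posts the file to the Mineru /file_parse endpoint; as the
# commit title notes, this raises while the deployed mineru-api does not accept DOCX.
markdown = processor.read_content()
processor.save_content(markdown)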
@@ -3,7 +3,7 @@ from typing import Optional
 from .document_processor import DocumentProcessor
 from .processors import (
     TxtDocumentProcessor,
-    # DocxDocumentProcessor,
+    DocxDocumentProcessor,
     PdfDocumentProcessor,
     MarkdownDocumentProcessor
 )
@@ -15,8 +15,8 @@ class DocumentProcessorFactory:
 
         processors = {
             '.txt': TxtDocumentProcessor,
-            # '.docx': DocxDocumentProcessor,
-            # '.doc': DocxDocumentProcessor,
+            '.docx': DocxDocumentProcessor,
+            '.doc': DocxDocumentProcessor,
             '.pdf': PdfDocumentProcessor,
             '.md': MarkdownDocumentProcessor,
             '.markdown': MarkdownDocumentProcessor
@@ -1,7 +1,6 @@
 from .txt_processor import TxtDocumentProcessor
-# from .docx_processor import DocxDocumentProcessor
+from .docx_processor import DocxDocumentProcessor
 from .pdf_processor import PdfDocumentProcessor
 from .md_processor import MarkdownDocumentProcessor
 
-# __all__ = ['TxtDocumentProcessor', 'DocxDocumentProcessor', 'PdfDocumentProcessor', 'MarkdownDocumentProcessor']
-__all__ = ['TxtDocumentProcessor', 'PdfDocumentProcessor', 'MarkdownDocumentProcessor']
+__all__ = ['TxtDocumentProcessor', 'DocxDocumentProcessor', 'PdfDocumentProcessor', 'MarkdownDocumentProcessor']
@@ -0,0 +1,219 @@
+import os
+import requests
+import logging
+from typing import Dict, Any, Optional
+from ...document_handlers.document_processor import DocumentProcessor
+from ...services.ollama_client import OllamaClient
+from ...config import settings
+
+logger = logging.getLogger(__name__)
+
+class DocxDocumentProcessor(DocumentProcessor):
+    def __init__(self, input_path: str, output_path: str):
+        super().__init__()  # Call parent class's __init__
+        self.input_path = input_path
+        self.output_path = output_path
+        self.output_dir = os.path.dirname(output_path)
+        self.name_without_suff = os.path.splitext(os.path.basename(input_path))[0]
+
+        # Setup work directory for temporary files
+        self.work_dir = os.path.join(
+            os.path.dirname(output_path),
+            ".work",
+            os.path.splitext(os.path.basename(input_path))[0]
+        )
+        os.makedirs(self.work_dir, exist_ok=True)
+
+        self.ollama_client = OllamaClient(model_name=settings.OLLAMA_MODEL, base_url=settings.OLLAMA_API_URL)
+
+        # Mineru API configuration
+        self.mineru_base_url = getattr(settings, 'MINERU_API_URL', 'http://mineru-api:8000')
+        self.mineru_timeout = getattr(settings, 'MINERU_TIMEOUT', 300)  # 5 minutes timeout
+        self.mineru_lang_list = getattr(settings, 'MINERU_LANG_LIST', ['ch'])
+        self.mineru_backend = getattr(settings, 'MINERU_BACKEND', 'pipeline')
+        self.mineru_parse_method = getattr(settings, 'MINERU_PARSE_METHOD', 'auto')
+        self.mineru_formula_enable = getattr(settings, 'MINERU_FORMULA_ENABLE', True)
+        self.mineru_table_enable = getattr(settings, 'MINERU_TABLE_ENABLE', True)
+
+    def _call_mineru_api(self, file_path: str) -> Optional[Dict[str, Any]]:
+        """
+        Call Mineru API to convert DOCX to markdown
+
+        Args:
+            file_path: Path to the DOCX file
+
+        Returns:
+            API response as dictionary or None if failed
+        """
+        try:
+            url = f"{self.mineru_base_url}/file_parse"
+
+            with open(file_path, 'rb') as file:
+                files = {'files': (os.path.basename(file_path), file, 'application/vnd.openxmlformats-officedocument.wordprocessingml.document')}
+
+                # Prepare form data according to Mineru API specification
+                data = {
+                    'output_dir': './output',
+                    'lang_list': self.mineru_lang_list,
+                    'backend': self.mineru_backend,
+                    'parse_method': self.mineru_parse_method,
+                    'formula_enable': self.mineru_formula_enable,
+                    'table_enable': self.mineru_table_enable,
+                    'return_md': True,
+                    'return_middle_json': False,
+                    'return_model_output': False,
+                    'return_content_list': False,
+                    'return_images': False,
+                    'start_page_id': 0,
+                    'end_page_id': 99999
+                }
+
+                logger.info(f"Calling Mineru API for DOCX processing at {url}")
+                response = requests.post(
+                    url,
+                    files=files,
+                    data=data,
+                    timeout=self.mineru_timeout
+                )
+
+            if response.status_code == 200:
+                result = response.json()
+                logger.info("Successfully received response from Mineru API for DOCX")
+                return result
+            else:
+                error_msg = f"Mineru API returned status code {response.status_code}: {response.text}"
+                logger.error(error_msg)
+                # For 400 errors, include more specific information
+                if response.status_code == 400:
+                    try:
+                        error_data = response.json()
+                        if 'error' in error_data:
+                            error_msg = f"Mineru API error: {error_data['error']}"
+                    except:
+                        pass
+                raise Exception(error_msg)
+
+        except requests.exceptions.Timeout:
+            error_msg = f"Mineru API request timed out after {self.mineru_timeout} seconds"
+            logger.error(error_msg)
+            raise Exception(error_msg)
+        except requests.exceptions.RequestException as e:
+            error_msg = f"Error calling Mineru API for DOCX: {str(e)}"
+            logger.error(error_msg)
+            raise Exception(error_msg)
+        except Exception as e:
+            error_msg = f"Unexpected error calling Mineru API for DOCX: {str(e)}"
+            logger.error(error_msg)
+            raise Exception(error_msg)
+
+    def _extract_markdown_from_response(self, response: Dict[str, Any]) -> str:
+        """
+        Extract markdown content from Mineru API response
+
+        Args:
+            response: Mineru API response dictionary
+
+        Returns:
+            Extracted markdown content as string
+        """
+        try:
+            logger.debug(f"Mineru API response structure for DOCX: {response}")
+
+            # Try different possible response formats based on Mineru API
+            if 'markdown' in response:
+                return response['markdown']
+            elif 'md' in response:
+                return response['md']
+            elif 'content' in response:
+                return response['content']
+            elif 'text' in response:
+                return response['text']
+            elif 'result' in response and isinstance(response['result'], dict):
+                result = response['result']
+                if 'markdown' in result:
+                    return result['markdown']
+                elif 'md' in result:
+                    return result['md']
+                elif 'content' in result:
+                    return result['content']
+                elif 'text' in result:
+                    return result['text']
+            elif 'data' in response and isinstance(response['data'], dict):
+                data = response['data']
+                if 'markdown' in data:
+                    return data['markdown']
+                elif 'md' in data:
+                    return data['md']
+                elif 'content' in data:
+                    return data['content']
+                elif 'text' in data:
+                    return data['text']
+            elif isinstance(response, list) and len(response) > 0:
+                # If response is a list, try to extract from first item
+                first_item = response[0]
+                if isinstance(first_item, dict):
+                    return self._extract_markdown_from_response(first_item)
+                elif isinstance(first_item, str):
+                    return first_item
+            else:
+                # If no standard format found, try to extract from the response structure
+                logger.warning("Could not find standard markdown field in Mineru response for DOCX")
+
+                # Return the response as string if it's simple, or empty string
+                if isinstance(response, str):
+                    return response
+                elif isinstance(response, dict):
+                    # Try to find any text-like content
+                    for key, value in response.items():
+                        if isinstance(value, str) and len(value) > 100:  # Likely content
+                            return value
+                        elif isinstance(value, dict):
+                            # Recursively search in nested dictionaries
+                            nested_content = self._extract_markdown_from_response(value)
+                            if nested_content:
+                                return nested_content
+
+            return ""
+
+        except Exception as e:
+            logger.error(f"Error extracting markdown from Mineru response for DOCX: {str(e)}")
+            return ""
+
+    def read_content(self) -> str:
+        logger.info("Starting DOCX content processing with Mineru API")
+
+        # Call Mineru API to convert DOCX to markdown
+        # This will raise an exception if the API call fails
+        mineru_response = self._call_mineru_api(self.input_path)
+
+        # Extract markdown content from the response
+        markdown_content = self._extract_markdown_from_response(mineru_response)
+
+        if not markdown_content:
+            raise Exception("No markdown content found in Mineru API response for DOCX")
+
+        logger.info(f"Successfully extracted {len(markdown_content)} characters of markdown content from DOCX")
+
+        # Save the raw markdown content to work directory for reference
+        md_output_path = os.path.join(self.work_dir, f"{self.name_without_suff}.md")
+        with open(md_output_path, 'w', encoding='utf-8') as file:
+            file.write(markdown_content)
+
+        logger.info(f"Saved raw markdown content from DOCX to {md_output_path}")
+
+        return markdown_content
+
+    def save_content(self, content: str) -> None:
+        # Ensure output path has .md extension
+        output_dir = os.path.dirname(self.output_path)
+        base_name = os.path.splitext(os.path.basename(self.output_path))[0]
+        md_output_path = os.path.join(output_dir, f"{base_name}.md")
+
+        logger.info(f"Saving masked DOCX content to: {md_output_path}")
+        try:
+            with open(md_output_path, 'w', encoding='utf-8') as file:
+                file.write(content)
+            logger.info(f"Successfully saved masked DOCX content to {md_output_path}")
+        except Exception as e:
+            logger.error(f"Error saving masked DOCX content: {e}")
+            raise
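The new processor reads every Mineru option with a getattr fallback, so config.settings only needs these entries when the defaults should be overridden; a sketch of the assumed settings follows (the real config module's structure is not shown in this commit):

# Sketch of the optional Mineru settings read via getattr(settings, ...) above.
# Names and defaults mirror the fallbacks in the processor; the actual config
# module may expose them differently (e.g. loaded from environment variables).
MINERU_API_URL = "http://mineru-api:8000"   # base URL of the mineru-api service
MINERU_TIMEOUT = 300                        # request timeout, seconds
MINERU_LANG_LIST = ["ch"]                   # language hint(s) passed to the parser
MINERU_BACKEND = "pipeline"
MINERU_PARSE_METHOD = "auto"
MINERU_FORMULA_ENABLE = True
MINERU_TABLE_ENABLE = True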
@@ -1,77 +0,0 @@
-import os
-import docx
-from ...document_handlers.document_processor import DocumentProcessor
-from magic_pdf.data.data_reader_writer import FileBasedDataWriter
-from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
-from magic_pdf.data.read_api import read_local_office
-import logging
-from ...services.ollama_client import OllamaClient
-from ...config import settings
-from ...prompts.masking_prompts import get_masking_mapping_prompt
-
-logger = logging.getLogger(__name__)
-
-class DocxDocumentProcessor(DocumentProcessor):
-    def __init__(self, input_path: str, output_path: str):
-        super().__init__()  # Call parent class's __init__
-        self.input_path = input_path
-        self.output_path = output_path
-        self.output_dir = os.path.dirname(output_path)
-        self.name_without_suff = os.path.splitext(os.path.basename(input_path))[0]
-
-        # Setup output directories
-        self.local_image_dir = os.path.join(self.output_dir, "images")
-        self.image_dir = os.path.basename(self.local_image_dir)
-        os.makedirs(self.local_image_dir, exist_ok=True)
-
-        self.ollama_client = OllamaClient(model_name=settings.OLLAMA_MODEL, base_url=settings.OLLAMA_API_URL)
-
-    def read_content(self) -> str:
-        try:
-            # Initialize writers
-            image_writer = FileBasedDataWriter(self.local_image_dir)
-            md_writer = FileBasedDataWriter(self.output_dir)
-
-            # Create Dataset Instance and process
-            ds = read_local_office(self.input_path)[0]
-            pipe_result = ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer)
-
-            # Generate markdown
-            md_content = pipe_result.get_markdown(self.image_dir)
-            pipe_result.dump_md(md_writer, f"{self.name_without_suff}.md", self.image_dir)
-
-            return md_content
-        except Exception as e:
-            logger.error(f"Error converting DOCX to MD: {e}")
-            raise
-
-    # def process_content(self, content: str) -> str:
-    #     logger.info("Processing DOCX content")
-
-    #     # Split content into sentences and apply masking
-    #     sentences = content.split("。")
-    #     final_md = ""
-    #     for sentence in sentences:
-    #         if sentence.strip():  # Only process non-empty sentences
-    #             formatted_prompt = get_masking_mapping_prompt(sentence)
-    #             logger.info("Calling ollama to generate response, prompt: %s", formatted_prompt)
-    #             response = self.ollama_client.generate(formatted_prompt)
-    #             logger.info(f"Response generated: {response}")
-    #             final_md += response + "。"
-
-    #     return final_md
-
-    def save_content(self, content: str) -> None:
-        # Ensure output path has .md extension
-        output_dir = os.path.dirname(self.output_path)
-        base_name = os.path.splitext(os.path.basename(self.output_path))[0]
-        md_output_path = os.path.join(output_dir, f"{base_name}.md")
-
-        logger.info(f"Saving masked content to: {md_output_path}")
-        try:
-            with open(md_output_path, 'w', encoding='utf-8') as file:
-                file.write(content)
-            logger.info(f"Successfully saved content to {md_output_path}")
-        except Exception as e:
-            logger.error(f"Error saving content: {e}")
-            raise
@@ -81,18 +81,30 @@ class PdfDocumentProcessor(DocumentProcessor):
                 logger.info("Successfully received response from Mineru API")
                 return result
             else:
-                logger.error(f"Mineru API returned status code {response.status_code}: {response.text}")
-                return None
+                error_msg = f"Mineru API returned status code {response.status_code}: {response.text}"
+                logger.error(error_msg)
+                # For 400 errors, include more specific information
+                if response.status_code == 400:
+                    try:
+                        error_data = response.json()
+                        if 'error' in error_data:
+                            error_msg = f"Mineru API error: {error_data['error']}"
+                    except:
+                        pass
+                raise Exception(error_msg)
 
         except requests.exceptions.Timeout:
-            logger.error(f"Mineru API request timed out after {self.mineru_timeout} seconds")
-            return None
+            error_msg = f"Mineru API request timed out after {self.mineru_timeout} seconds"
+            logger.error(error_msg)
+            raise Exception(error_msg)
         except requests.exceptions.RequestException as e:
-            logger.error(f"Error calling Mineru API: {str(e)}")
-            return None
+            error_msg = f"Error calling Mineru API: {str(e)}"
+            logger.error(error_msg)
+            raise Exception(error_msg)
         except Exception as e:
-            logger.error(f"Unexpected error calling Mineru API: {str(e)}")
-            return None
+            error_msg = f"Unexpected error calling Mineru API: {str(e)}"
+            logger.error(error_msg)
+            raise Exception(error_msg)
 
     def _extract_markdown_from_response(self, response: Dict[str, Any]) -> str:
         """
@@ -171,11 +183,9 @@ class PdfDocumentProcessor(DocumentProcessor):
         logger.info("Starting PDF content processing with Mineru API")
 
         # Call Mineru API to convert PDF to markdown
+        # This will raise an exception if the API call fails
         mineru_response = self._call_mineru_api(self.input_path)
 
-        if not mineru_response:
-            raise Exception("Failed to get response from Mineru API")
-
         # Extract markdown content from the response
         markdown_content = self._extract_markdown_from_response(mineru_response)
 
@@ -13,7 +13,7 @@ class DocumentService:
         processor = DocumentProcessorFactory.create_processor(input_path, output_path)
         if not processor:
             logger.error(f"Unsupported file format: {input_path}")
-            return False
+            raise Exception(f"Unsupported file format: {input_path}")
 
         # Read content
         content = processor.read_content()
@@ -27,4 +27,5 @@ class DocumentService:
 
         except Exception as e:
             logger.error(f"Error processing document {input_path}: {str(e)}")
-            return False
+            # Re-raise the exception so the Celery task can handle it properly
+            raise
@@ -70,6 +70,7 @@ def process_file(file_id: str):
         output_path = str(settings.PROCESSED_FOLDER / output_filename)
 
         # Process document with both input and output paths
+        # This will raise an exception if processing fails
         process_service.process_document(file.original_path, output_path)
 
         # Update file record with processed path
@@ -81,6 +82,7 @@ def process_file(file_id: str):
         file.status = FileStatus.FAILED
         file.error_message = str(e)
         db.commit()
+        # Re-raise the exception to ensure Celery marks the task as failed
         raise
 
     finally:
@@ -34,7 +34,6 @@ services:
       - "8000:8000"
     volumes:
       - ./backend/storage:/app/storage
-      - ./backend/legal_doc_masker.db:/app/legal_doc_masker.db
     env_file:
      - ./backend/.env
     environment:
@@ -55,7 +54,6 @@ services:
     command: celery -A app.services.file_service worker --loglevel=info
     volumes:
       - ./backend/storage:/app/storage
-      - ./backend/legal_doc_masker.db:/app/legal_doc_masker.db
     env_file:
      - ./backend/.env
     environment: