feat: 正式全面支持 DOCX (full DOCX support)

This commit is contained in:
tigermren 2025-08-18 01:15:40 +08:00
parent afddcf4dd7
commit 2075218955
2 changed files with 63 additions and 36 deletions

View File

@ -26,18 +26,19 @@ class DocxDocumentProcessor(DocumentProcessor):
self.ollama_client = OllamaClient(model_name=settings.OLLAMA_MODEL, base_url=settings.OLLAMA_API_URL)
# Mineru API configuration
self.mineru_base_url = getattr(settings, 'MINERU_API_URL', 'http://mineru-api:8000')
self.mineru_timeout = getattr(settings, 'MINERU_TIMEOUT', 300) # 5 minutes timeout
self.mineru_lang_list = getattr(settings, 'MINERU_LANG_LIST', ['ch'])
self.mineru_backend = getattr(settings, 'MINERU_BACKEND', 'pipeline')
self.mineru_parse_method = getattr(settings, 'MINERU_PARSE_METHOD', 'auto')
self.mineru_formula_enable = getattr(settings, 'MINERU_FORMULA_ENABLE', True)
self.mineru_table_enable = getattr(settings, 'MINERU_TABLE_ENABLE', True)
# MagicDoc API configuration (replacing Mineru)
self.magicdoc_base_url = getattr(settings, 'MAGICDOC_API_URL', 'http://magicdoc-api:8000')
self.magicdoc_timeout = getattr(settings, 'MAGICDOC_TIMEOUT', 300) # 5 minutes timeout
# MagicDoc uses simpler parameters, but we keep the Mineru-style settings below for compatibility with the existing interface
self.magicdoc_lang_list = getattr(settings, 'MAGICDOC_LANG_LIST', 'ch')
self.magicdoc_backend = getattr(settings, 'MAGICDOC_BACKEND', 'pipeline')
self.magicdoc_parse_method = getattr(settings, 'MAGICDOC_PARSE_METHOD', 'auto')
self.magicdoc_formula_enable = getattr(settings, 'MAGICDOC_FORMULA_ENABLE', True)
self.magicdoc_table_enable = getattr(settings, 'MAGICDOC_TABLE_ENABLE', True)
def _call_mineru_api(self, file_path: str) -> Optional[Dict[str, Any]]:
def _call_magicdoc_api(self, file_path: str) -> Optional[Dict[str, Any]]:
"""
Call Mineru API to convert DOCX to markdown
Call MagicDoc API to convert DOCX to markdown
Args:
file_path: Path to the DOCX file
@ -46,19 +47,19 @@ class DocxDocumentProcessor(DocumentProcessor):
API response as dictionary or None if failed
"""
try:
url = f"{self.mineru_base_url}/file_parse"
url = f"{self.magicdoc_base_url}/file_parse"
with open(file_path, 'rb') as file:
files = {'files': (os.path.basename(file_path), file, 'application/vnd.openxmlformats-officedocument.wordprocessingml.document')}
# Prepare form data according to Mineru API specification
# Prepare form data according to MagicDoc API specification (compatible with Mineru)
data = {
'output_dir': './output',
'lang_list': self.mineru_lang_list,
'backend': self.mineru_backend,
'parse_method': self.mineru_parse_method,
'formula_enable': self.mineru_formula_enable,
'table_enable': self.mineru_table_enable,
'lang_list': self.magicdoc_lang_list,
'backend': self.magicdoc_backend,
'parse_method': self.magicdoc_parse_method,
'formula_enable': self.magicdoc_formula_enable,
'table_enable': self.magicdoc_table_enable,
'return_md': True,
'return_middle_json': False,
'return_model_output': False,
@ -68,58 +69,58 @@ class DocxDocumentProcessor(DocumentProcessor):
'end_page_id': 99999
}
logger.info(f"Calling Mineru API for DOCX processing at {url}")
logger.info(f"Calling MagicDoc API for DOCX processing at {url}")
response = requests.post(
url,
files=files,
data=data,
timeout=self.mineru_timeout
timeout=self.magicdoc_timeout
)
if response.status_code == 200:
result = response.json()
logger.info("Successfully received response from Mineru API for DOCX")
logger.info("Successfully received response from MagicDoc API for DOCX")
return result
else:
error_msg = f"Mineru API returned status code {response.status_code}: {response.text}"
error_msg = f"MagicDoc API returned status code {response.status_code}: {response.text}"
logger.error(error_msg)
# For 400 errors, include more specific information
if response.status_code == 400:
try:
error_data = response.json()
if 'error' in error_data:
error_msg = f"Mineru API error: {error_data['error']}"
error_msg = f"MagicDoc API error: {error_data['error']}"
except:
pass
raise Exception(error_msg)
except requests.exceptions.Timeout:
error_msg = f"Mineru API request timed out after {self.mineru_timeout} seconds"
error_msg = f"MagicDoc API request timed out after {self.magicdoc_timeout} seconds"
logger.error(error_msg)
raise Exception(error_msg)
except requests.exceptions.RequestException as e:
error_msg = f"Error calling Mineru API for DOCX: {str(e)}"
error_msg = f"Error calling MagicDoc API for DOCX: {str(e)}"
logger.error(error_msg)
raise Exception(error_msg)
except Exception as e:
error_msg = f"Unexpected error calling Mineru API for DOCX: {str(e)}"
error_msg = f"Unexpected error calling MagicDoc API for DOCX: {str(e)}"
logger.error(error_msg)
raise Exception(error_msg)
def _extract_markdown_from_response(self, response: Dict[str, Any]) -> str:
"""
Extract markdown content from Mineru API response
Extract markdown content from MagicDoc API response
Args:
response: Mineru API response dictionary
response: MagicDoc API response dictionary
Returns:
Extracted markdown content as string
"""
try:
logger.debug(f"Mineru API response structure for DOCX: {response}")
logger.debug(f"MagicDoc API response structure for DOCX: {response}")
# Try different possible response formats based on Mineru API
# Try different possible response formats based on MagicDoc API
if 'markdown' in response:
return response['markdown']
elif 'md' in response:
@ -157,7 +158,7 @@ class DocxDocumentProcessor(DocumentProcessor):
return first_item
else:
# If no standard format found, try to extract from the response structure
logger.warning("Could not find standard markdown field in Mineru response for DOCX")
logger.warning("Could not find standard markdown field in MagicDoc response for DOCX")
# Return the response as string if it's simple, or empty string
if isinstance(response, str):
@ -176,21 +177,21 @@ class DocxDocumentProcessor(DocumentProcessor):
return ""
except Exception as e:
logger.error(f"Error extracting markdown from Mineru response for DOCX: {str(e)}")
logger.error(f"Error extracting markdown from MagicDoc response for DOCX: {str(e)}")
return ""
def read_content(self) -> str:
logger.info("Starting DOCX content processing with Mineru API")
logger.info("Starting DOCX content processing with MagicDoc API")
# Call Mineru API to convert DOCX to markdown
# Call MagicDoc API to convert DOCX to markdown
# This will raise an exception if the API call fails
mineru_response = self._call_mineru_api(self.input_path)
magicdoc_response = self._call_magicdoc_api(self.input_path)
# Extract markdown content from the response
markdown_content = self._extract_markdown_from_response(mineru_response)
markdown_content = self._extract_markdown_from_response(magicdoc_response)
if not markdown_content:
raise Exception("No markdown content found in Mineru API response for DOCX")
raise Exception("No markdown content found in MagicDoc API response for DOCX")
logger.info(f"Successfully extracted {len(markdown_content)} characters of markdown content from DOCX")

View File

@ -25,6 +25,29 @@ services:
networks:
- app-network
# MagicDoc API Service
magicdoc-api:
build:
context: ./magicdoc
dockerfile: Dockerfile
platform: linux/amd64
ports:
- "8002:8000"
volumes:
- ./magicdoc/storage/uploads:/app/storage/uploads
- ./magicdoc/storage/processed:/app/storage/processed
environment:
- PYTHONUNBUFFERED=1
restart: unless-stopped
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
interval: 30s
timeout: 10s
retries: 3
start_period: 60s
networks:
- app-network
# Backend API Service
backend-api:
build:
@ -40,9 +63,11 @@ services:
- CELERY_BROKER_URL=redis://redis:6379/0
- CELERY_RESULT_BACKEND=redis://redis:6379/0
- MINERU_API_URL=http://mineru-api:8000
- MAGICDOC_API_URL=http://magicdoc-api:8000
depends_on:
- redis
- mineru-api
- magicdoc-api
networks:
- app-network
@ -60,6 +85,7 @@ services:
- CELERY_BROKER_URL=redis://redis:6379/0
- CELERY_RESULT_BACKEND=redis://redis:6379/0
- MINERU_API_URL=http://mineru-api:8000
- MAGICDOC_API_URL=http://magicdoc-api:8000
depends_on:
- redis
- backend-api