feat: officially and fully support DOCX
This commit is contained in:
parent
afddcf4dd7
commit
2075218955
|
|
@ -26,18 +26,19 @@ class DocxDocumentProcessor(DocumentProcessor):
|
||||||
|
|
||||||
self.ollama_client = OllamaClient(model_name=settings.OLLAMA_MODEL, base_url=settings.OLLAMA_API_URL)
|
self.ollama_client = OllamaClient(model_name=settings.OLLAMA_MODEL, base_url=settings.OLLAMA_API_URL)
|
||||||
|
|
||||||
# Mineru API configuration
|
# MagicDoc API configuration (replacing Mineru)
|
||||||
self.mineru_base_url = getattr(settings, 'MINERU_API_URL', 'http://mineru-api:8000')
|
self.magicdoc_base_url = getattr(settings, 'MAGICDOC_API_URL', 'http://magicdoc-api:8000')
|
||||||
self.mineru_timeout = getattr(settings, 'MINERU_TIMEOUT', 300) # 5 minutes timeout
|
self.magicdoc_timeout = getattr(settings, 'MAGICDOC_TIMEOUT', 300) # 5 minutes timeout
|
||||||
self.mineru_lang_list = getattr(settings, 'MINERU_LANG_LIST', ['ch'])
|
# MagicDoc uses simpler parameters, but we keep compatibility with the existing interface
|
||||||
self.mineru_backend = getattr(settings, 'MINERU_BACKEND', 'pipeline')
|
self.magicdoc_lang_list = getattr(settings, 'MAGICDOC_LANG_LIST', 'ch')
|
||||||
self.mineru_parse_method = getattr(settings, 'MINERU_PARSE_METHOD', 'auto')
|
self.magicdoc_backend = getattr(settings, 'MAGICDOC_BACKEND', 'pipeline')
|
||||||
self.mineru_formula_enable = getattr(settings, 'MINERU_FORMULA_ENABLE', True)
|
self.magicdoc_parse_method = getattr(settings, 'MAGICDOC_PARSE_METHOD', 'auto')
|
||||||
self.mineru_table_enable = getattr(settings, 'MINERU_TABLE_ENABLE', True)
|
self.magicdoc_formula_enable = getattr(settings, 'MAGICDOC_FORMULA_ENABLE', True)
|
||||||
|
self.magicdoc_table_enable = getattr(settings, 'MAGICDOC_TABLE_ENABLE', True)
|
||||||
|
|
||||||
def _call_mineru_api(self, file_path: str) -> Optional[Dict[str, Any]]:
|
def _call_magicdoc_api(self, file_path: str) -> Optional[Dict[str, Any]]:
|
||||||
"""
|
"""
|
||||||
Call Mineru API to convert DOCX to markdown
|
Call MagicDoc API to convert DOCX to markdown
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
file_path: Path to the DOCX file
|
file_path: Path to the DOCX file
|
||||||
|
|
@ -46,19 +47,19 @@ class DocxDocumentProcessor(DocumentProcessor):
|
||||||
API response as dictionary or None if failed
|
API response as dictionary or None if failed
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
url = f"{self.mineru_base_url}/file_parse"
|
url = f"{self.magicdoc_base_url}/file_parse"
|
||||||
|
|
||||||
with open(file_path, 'rb') as file:
|
with open(file_path, 'rb') as file:
|
||||||
files = {'files': (os.path.basename(file_path), file, 'application/vnd.openxmlformats-officedocument.wordprocessingml.document')}
|
files = {'files': (os.path.basename(file_path), file, 'application/vnd.openxmlformats-officedocument.wordprocessingml.document')}
|
||||||
|
|
||||||
# Prepare form data according to Mineru API specification
|
# Prepare form data according to MagicDoc API specification (compatible with Mineru)
|
||||||
data = {
|
data = {
|
||||||
'output_dir': './output',
|
'output_dir': './output',
|
||||||
'lang_list': self.mineru_lang_list,
|
'lang_list': self.magicdoc_lang_list,
|
||||||
'backend': self.mineru_backend,
|
'backend': self.magicdoc_backend,
|
||||||
'parse_method': self.mineru_parse_method,
|
'parse_method': self.magicdoc_parse_method,
|
||||||
'formula_enable': self.mineru_formula_enable,
|
'formula_enable': self.magicdoc_formula_enable,
|
||||||
'table_enable': self.mineru_table_enable,
|
'table_enable': self.magicdoc_table_enable,
|
||||||
'return_md': True,
|
'return_md': True,
|
||||||
'return_middle_json': False,
|
'return_middle_json': False,
|
||||||
'return_model_output': False,
|
'return_model_output': False,
|
||||||
|
|
@ -68,58 +69,58 @@ class DocxDocumentProcessor(DocumentProcessor):
|
||||||
'end_page_id': 99999
|
'end_page_id': 99999
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.info(f"Calling Mineru API for DOCX processing at {url}")
|
logger.info(f"Calling MagicDoc API for DOCX processing at {url}")
|
||||||
response = requests.post(
|
response = requests.post(
|
||||||
url,
|
url,
|
||||||
files=files,
|
files=files,
|
||||||
data=data,
|
data=data,
|
||||||
timeout=self.mineru_timeout
|
timeout=self.magicdoc_timeout
|
||||||
)
|
)
|
||||||
|
|
||||||
if response.status_code == 200:
|
if response.status_code == 200:
|
||||||
result = response.json()
|
result = response.json()
|
||||||
logger.info("Successfully received response from Mineru API for DOCX")
|
logger.info("Successfully received response from MagicDoc API for DOCX")
|
||||||
return result
|
return result
|
||||||
else:
|
else:
|
||||||
error_msg = f"Mineru API returned status code {response.status_code}: {response.text}"
|
error_msg = f"MagicDoc API returned status code {response.status_code}: {response.text}"
|
||||||
logger.error(error_msg)
|
logger.error(error_msg)
|
||||||
# For 400 errors, include more specific information
|
# For 400 errors, include more specific information
|
||||||
if response.status_code == 400:
|
if response.status_code == 400:
|
||||||
try:
|
try:
|
||||||
error_data = response.json()
|
error_data = response.json()
|
||||||
if 'error' in error_data:
|
if 'error' in error_data:
|
||||||
error_msg = f"Mineru API error: {error_data['error']}"
|
error_msg = f"MagicDoc API error: {error_data['error']}"
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
raise Exception(error_msg)
|
raise Exception(error_msg)
|
||||||
|
|
||||||
except requests.exceptions.Timeout:
|
except requests.exceptions.Timeout:
|
||||||
error_msg = f"Mineru API request timed out after {self.mineru_timeout} seconds"
|
error_msg = f"MagicDoc API request timed out after {self.magicdoc_timeout} seconds"
|
||||||
logger.error(error_msg)
|
logger.error(error_msg)
|
||||||
raise Exception(error_msg)
|
raise Exception(error_msg)
|
||||||
except requests.exceptions.RequestException as e:
|
except requests.exceptions.RequestException as e:
|
||||||
error_msg = f"Error calling Mineru API for DOCX: {str(e)}"
|
error_msg = f"Error calling MagicDoc API for DOCX: {str(e)}"
|
||||||
logger.error(error_msg)
|
logger.error(error_msg)
|
||||||
raise Exception(error_msg)
|
raise Exception(error_msg)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
error_msg = f"Unexpected error calling Mineru API for DOCX: {str(e)}"
|
error_msg = f"Unexpected error calling MagicDoc API for DOCX: {str(e)}"
|
||||||
logger.error(error_msg)
|
logger.error(error_msg)
|
||||||
raise Exception(error_msg)
|
raise Exception(error_msg)
|
||||||
|
|
||||||
def _extract_markdown_from_response(self, response: Dict[str, Any]) -> str:
|
def _extract_markdown_from_response(self, response: Dict[str, Any]) -> str:
|
||||||
"""
|
"""
|
||||||
Extract markdown content from Mineru API response
|
Extract markdown content from MagicDoc API response
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
response: Mineru API response dictionary
|
response: MagicDoc API response dictionary
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Extracted markdown content as string
|
Extracted markdown content as string
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
logger.debug(f"Mineru API response structure for DOCX: {response}")
|
logger.debug(f"MagicDoc API response structure for DOCX: {response}")
|
||||||
|
|
||||||
# Try different possible response formats based on Mineru API
|
# Try different possible response formats based on MagicDoc API
|
||||||
if 'markdown' in response:
|
if 'markdown' in response:
|
||||||
return response['markdown']
|
return response['markdown']
|
||||||
elif 'md' in response:
|
elif 'md' in response:
|
||||||
|
|
@ -157,7 +158,7 @@ class DocxDocumentProcessor(DocumentProcessor):
|
||||||
return first_item
|
return first_item
|
||||||
else:
|
else:
|
||||||
# If no standard format found, try to extract from the response structure
|
# If no standard format found, try to extract from the response structure
|
||||||
logger.warning("Could not find standard markdown field in Mineru response for DOCX")
|
logger.warning("Could not find standard markdown field in MagicDoc response for DOCX")
|
||||||
|
|
||||||
# Return the response as string if it's simple, or empty string
|
# Return the response as string if it's simple, or empty string
|
||||||
if isinstance(response, str):
|
if isinstance(response, str):
|
||||||
|
|
@ -176,21 +177,21 @@ class DocxDocumentProcessor(DocumentProcessor):
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error extracting markdown from Mineru response for DOCX: {str(e)}")
|
logger.error(f"Error extracting markdown from MagicDoc response for DOCX: {str(e)}")
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
def read_content(self) -> str:
|
def read_content(self) -> str:
|
||||||
logger.info("Starting DOCX content processing with Mineru API")
|
logger.info("Starting DOCX content processing with MagicDoc API")
|
||||||
|
|
||||||
# Call Mineru API to convert DOCX to markdown
|
# Call MagicDoc API to convert DOCX to markdown
|
||||||
# This will raise an exception if the API call fails
|
# This will raise an exception if the API call fails
|
||||||
mineru_response = self._call_mineru_api(self.input_path)
|
magicdoc_response = self._call_magicdoc_api(self.input_path)
|
||||||
|
|
||||||
# Extract markdown content from the response
|
# Extract markdown content from the response
|
||||||
markdown_content = self._extract_markdown_from_response(mineru_response)
|
markdown_content = self._extract_markdown_from_response(magicdoc_response)
|
||||||
|
|
||||||
if not markdown_content:
|
if not markdown_content:
|
||||||
raise Exception("No markdown content found in Mineru API response for DOCX")
|
raise Exception("No markdown content found in MagicDoc API response for DOCX")
|
||||||
|
|
||||||
logger.info(f"Successfully extracted {len(markdown_content)} characters of markdown content from DOCX")
|
logger.info(f"Successfully extracted {len(markdown_content)} characters of markdown content from DOCX")
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -25,6 +25,29 @@ services:
|
||||||
networks:
|
networks:
|
||||||
- app-network
|
- app-network
|
||||||
|
|
||||||
|
# MagicDoc API Service
|
||||||
|
magicdoc-api:
|
||||||
|
build:
|
||||||
|
context: ./magicdoc
|
||||||
|
dockerfile: Dockerfile
|
||||||
|
platform: linux/amd64
|
||||||
|
ports:
|
||||||
|
- "8002:8000"
|
||||||
|
volumes:
|
||||||
|
- ./magicdoc/storage/uploads:/app/storage/uploads
|
||||||
|
- ./magicdoc/storage/processed:/app/storage/processed
|
||||||
|
environment:
|
||||||
|
- PYTHONUNBUFFERED=1
|
||||||
|
restart: unless-stopped
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 10s
|
||||||
|
retries: 3
|
||||||
|
start_period: 60s
|
||||||
|
networks:
|
||||||
|
- app-network
|
||||||
|
|
||||||
# Backend API Service
|
# Backend API Service
|
||||||
backend-api:
|
backend-api:
|
||||||
build:
|
build:
|
||||||
|
|
@ -40,9 +63,11 @@ services:
|
||||||
- CELERY_BROKER_URL=redis://redis:6379/0
|
- CELERY_BROKER_URL=redis://redis:6379/0
|
||||||
- CELERY_RESULT_BACKEND=redis://redis:6379/0
|
- CELERY_RESULT_BACKEND=redis://redis:6379/0
|
||||||
- MINERU_API_URL=http://mineru-api:8000
|
- MINERU_API_URL=http://mineru-api:8000
|
||||||
|
- MAGICDOC_API_URL=http://magicdoc-api:8000
|
||||||
depends_on:
|
depends_on:
|
||||||
- redis
|
- redis
|
||||||
- mineru-api
|
- mineru-api
|
||||||
|
- magicdoc-api
|
||||||
networks:
|
networks:
|
||||||
- app-network
|
- app-network
|
||||||
|
|
||||||
|
|
@ -60,6 +85,7 @@ services:
|
||||||
- CELERY_BROKER_URL=redis://redis:6379/0
|
- CELERY_BROKER_URL=redis://redis:6379/0
|
||||||
- CELERY_RESULT_BACKEND=redis://redis:6379/0
|
- CELERY_RESULT_BACKEND=redis://redis:6379/0
|
||||||
- MINERU_API_URL=http://mineru-api:8000
|
- MINERU_API_URL=http://mineru-api:8000
|
||||||
|
- MAGICDOC_API_URL=http://magicdoc-api:8000
|
||||||
depends_on:
|
depends_on:
|
||||||
- redis
|
- redis
|
||||||
- backend-api
|
- backend-api
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue