feat: officially add full DOCX support
This commit is contained in:
parent
afddcf4dd7
commit
2075218955
|
|
@ -26,18 +26,19 @@ class DocxDocumentProcessor(DocumentProcessor):
|
|||
|
||||
self.ollama_client = OllamaClient(model_name=settings.OLLAMA_MODEL, base_url=settings.OLLAMA_API_URL)
|
||||
|
||||
# Mineru API configuration
|
||||
self.mineru_base_url = getattr(settings, 'MINERU_API_URL', 'http://mineru-api:8000')
|
||||
self.mineru_timeout = getattr(settings, 'MINERU_TIMEOUT', 300) # 5 minutes timeout
|
||||
self.mineru_lang_list = getattr(settings, 'MINERU_LANG_LIST', ['ch'])
|
||||
self.mineru_backend = getattr(settings, 'MINERU_BACKEND', 'pipeline')
|
||||
self.mineru_parse_method = getattr(settings, 'MINERU_PARSE_METHOD', 'auto')
|
||||
self.mineru_formula_enable = getattr(settings, 'MINERU_FORMULA_ENABLE', True)
|
||||
self.mineru_table_enable = getattr(settings, 'MINERU_TABLE_ENABLE', True)
|
||||
# MagicDoc API configuration (replacing Mineru)
|
||||
self.magicdoc_base_url = getattr(settings, 'MAGICDOC_API_URL', 'http://magicdoc-api:8000')
|
||||
self.magicdoc_timeout = getattr(settings, 'MAGICDOC_TIMEOUT', 300) # 5 minutes timeout
|
||||
# MagicDoc uses simpler parameters, but we keep the Mineru-style options for compatibility with the existing interface
|
||||
self.magicdoc_lang_list = getattr(settings, 'MAGICDOC_LANG_LIST', 'ch')
|
||||
self.magicdoc_backend = getattr(settings, 'MAGICDOC_BACKEND', 'pipeline')
|
||||
self.magicdoc_parse_method = getattr(settings, 'MAGICDOC_PARSE_METHOD', 'auto')
|
||||
self.magicdoc_formula_enable = getattr(settings, 'MAGICDOC_FORMULA_ENABLE', True)
|
||||
self.magicdoc_table_enable = getattr(settings, 'MAGICDOC_TABLE_ENABLE', True)
|
||||
|
||||
def _call_mineru_api(self, file_path: str) -> Optional[Dict[str, Any]]:
|
||||
def _call_magicdoc_api(self, file_path: str) -> Optional[Dict[str, Any]]:
|
||||
"""
|
||||
Call Mineru API to convert DOCX to markdown
|
||||
Call MagicDoc API to convert DOCX to markdown
|
||||
|
||||
Args:
|
||||
file_path: Path to the DOCX file
|
||||
|
|
@ -46,19 +47,19 @@ class DocxDocumentProcessor(DocumentProcessor):
|
|||
API response as dictionary or None if failed
|
||||
"""
|
||||
try:
|
||||
url = f"{self.mineru_base_url}/file_parse"
|
||||
url = f"{self.magicdoc_base_url}/file_parse"
|
||||
|
||||
with open(file_path, 'rb') as file:
|
||||
files = {'files': (os.path.basename(file_path), file, 'application/vnd.openxmlformats-officedocument.wordprocessingml.document')}
|
||||
|
||||
# Prepare form data according to Mineru API specification
|
||||
# Prepare form data according to the MagicDoc API specification (same fields as the Mineru API)
|
||||
data = {
|
||||
'output_dir': './output',
|
||||
'lang_list': self.mineru_lang_list,
|
||||
'backend': self.mineru_backend,
|
||||
'parse_method': self.mineru_parse_method,
|
||||
'formula_enable': self.mineru_formula_enable,
|
||||
'table_enable': self.mineru_table_enable,
|
||||
'lang_list': self.magicdoc_lang_list,
|
||||
'backend': self.magicdoc_backend,
|
||||
'parse_method': self.magicdoc_parse_method,
|
||||
'formula_enable': self.magicdoc_formula_enable,
|
||||
'table_enable': self.magicdoc_table_enable,
|
||||
'return_md': True,
|
||||
'return_middle_json': False,
|
||||
'return_model_output': False,
|
||||
|
|
@ -68,58 +69,58 @@ class DocxDocumentProcessor(DocumentProcessor):
|
|||
'end_page_id': 99999
|
||||
}
|
||||
|
||||
logger.info(f"Calling Mineru API for DOCX processing at {url}")
|
||||
logger.info(f"Calling MagicDoc API for DOCX processing at {url}")
|
||||
response = requests.post(
|
||||
url,
|
||||
files=files,
|
||||
data=data,
|
||||
timeout=self.mineru_timeout
|
||||
timeout=self.magicdoc_timeout
|
||||
)
|
||||
|
||||
if response.status_code == 200:
|
||||
result = response.json()
|
||||
logger.info("Successfully received response from Mineru API for DOCX")
|
||||
logger.info("Successfully received response from MagicDoc API for DOCX")
|
||||
return result
|
||||
else:
|
||||
error_msg = f"Mineru API returned status code {response.status_code}: {response.text}"
|
||||
error_msg = f"MagicDoc API returned status code {response.status_code}: {response.text}"
|
||||
logger.error(error_msg)
|
||||
# For 400 errors, include more specific information
|
||||
if response.status_code == 400:
|
||||
try:
|
||||
error_data = response.json()
|
||||
if 'error' in error_data:
|
||||
error_msg = f"Mineru API error: {error_data['error']}"
|
||||
error_msg = f"MagicDoc API error: {error_data['error']}"
|
||||
except:
|
||||
pass
|
||||
raise Exception(error_msg)
|
||||
|
||||
except requests.exceptions.Timeout:
|
||||
error_msg = f"Mineru API request timed out after {self.mineru_timeout} seconds"
|
||||
error_msg = f"MagicDoc API request timed out after {self.magicdoc_timeout} seconds"
|
||||
logger.error(error_msg)
|
||||
raise Exception(error_msg)
|
||||
except requests.exceptions.RequestException as e:
|
||||
error_msg = f"Error calling Mineru API for DOCX: {str(e)}"
|
||||
error_msg = f"Error calling MagicDoc API for DOCX: {str(e)}"
|
||||
logger.error(error_msg)
|
||||
raise Exception(error_msg)
|
||||
except Exception as e:
|
||||
error_msg = f"Unexpected error calling Mineru API for DOCX: {str(e)}"
|
||||
error_msg = f"Unexpected error calling MagicDoc API for DOCX: {str(e)}"
|
||||
logger.error(error_msg)
|
||||
raise Exception(error_msg)
|
||||
|
||||
def _extract_markdown_from_response(self, response: Dict[str, Any]) -> str:
|
||||
"""
|
||||
Extract markdown content from Mineru API response
|
||||
Extract markdown content from MagicDoc API response
|
||||
|
||||
Args:
|
||||
response: Mineru API response dictionary
|
||||
response: MagicDoc API response dictionary
|
||||
|
||||
Returns:
|
||||
Extracted markdown content as string
|
||||
"""
|
||||
try:
|
||||
logger.debug(f"Mineru API response structure for DOCX: {response}")
|
||||
logger.debug(f"MagicDoc API response structure for DOCX: {response}")
|
||||
|
||||
# Try different possible response formats based on Mineru API
|
||||
# Try different possible response formats based on MagicDoc API
|
||||
if 'markdown' in response:
|
||||
return response['markdown']
|
||||
elif 'md' in response:
|
||||
|
|
@ -157,7 +158,7 @@ class DocxDocumentProcessor(DocumentProcessor):
|
|||
return first_item
|
||||
else:
|
||||
# If no standard format found, try to extract from the response structure
|
||||
logger.warning("Could not find standard markdown field in Mineru response for DOCX")
|
||||
logger.warning("Could not find standard markdown field in MagicDoc response for DOCX")
|
||||
|
||||
# Return the response as a string if it's simple, or an empty string otherwise
|
||||
if isinstance(response, str):
|
||||
|
|
@ -176,21 +177,21 @@ class DocxDocumentProcessor(DocumentProcessor):
|
|||
return ""
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error extracting markdown from Mineru response for DOCX: {str(e)}")
|
||||
logger.error(f"Error extracting markdown from MagicDoc response for DOCX: {str(e)}")
|
||||
return ""
|
||||
|
||||
def read_content(self) -> str:
|
||||
logger.info("Starting DOCX content processing with Mineru API")
|
||||
logger.info("Starting DOCX content processing with MagicDoc API")
|
||||
|
||||
# Call Mineru API to convert DOCX to markdown
|
||||
# Call MagicDoc API to convert DOCX to markdown
|
||||
# This will raise an exception if the API call fails
|
||||
mineru_response = self._call_mineru_api(self.input_path)
|
||||
magicdoc_response = self._call_magicdoc_api(self.input_path)
|
||||
|
||||
# Extract markdown content from the response
|
||||
markdown_content = self._extract_markdown_from_response(mineru_response)
|
||||
markdown_content = self._extract_markdown_from_response(magicdoc_response)
|
||||
|
||||
if not markdown_content:
|
||||
raise Exception("No markdown content found in Mineru API response for DOCX")
|
||||
raise Exception("No markdown content found in MagicDoc API response for DOCX")
|
||||
|
||||
logger.info(f"Successfully extracted {len(markdown_content)} characters of markdown content from DOCX")
|
||||
|
||||
|
|
|
|||
|
|
@ -25,6 +25,29 @@ services:
|
|||
networks:
|
||||
- app-network
|
||||
|
||||
# MagicDoc API Service
|
||||
magicdoc-api:
|
||||
build:
|
||||
context: ./magicdoc
|
||||
dockerfile: Dockerfile
|
||||
platform: linux/amd64
|
||||
ports:
|
||||
- "8002:8000"
|
||||
volumes:
|
||||
- ./magicdoc/storage/uploads:/app/storage/uploads
|
||||
- ./magicdoc/storage/processed:/app/storage/processed
|
||||
environment:
|
||||
- PYTHONUNBUFFERED=1
|
||||
restart: unless-stopped
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 60s
|
||||
networks:
|
||||
- app-network
|
||||
|
||||
# Backend API Service
|
||||
backend-api:
|
||||
build:
|
||||
|
|
@ -40,9 +63,11 @@ services:
|
|||
- CELERY_BROKER_URL=redis://redis:6379/0
|
||||
- CELERY_RESULT_BACKEND=redis://redis:6379/0
|
||||
- MINERU_API_URL=http://mineru-api:8000
|
||||
- MAGICDOC_API_URL=http://magicdoc-api:8000
|
||||
depends_on:
|
||||
- redis
|
||||
- mineru-api
|
||||
- magicdoc-api
|
||||
networks:
|
||||
- app-network
|
||||
|
||||
|
|
@ -60,6 +85,7 @@ services:
|
|||
- CELERY_BROKER_URL=redis://redis:6379/0
|
||||
- CELERY_RESULT_BACKEND=redis://redis:6379/0
|
||||
- MINERU_API_URL=http://mineru-api:8000
|
||||
- MAGICDOC_API_URL=http://magicdoc-api:8000
|
||||
depends_on:
|
||||
- redis
|
||||
- backend-api
|
||||
|
|
|
|||
Loading…
Reference in New Issue