Compare commits
4 Commits
84499f52ea
...
2075218955
| Author | SHA1 | Date |
|---|---|---|
|
|
2075218955 | |
|
|
afddcf4dd7 | |
|
|
0820d7bba2 | |
|
|
a16b69475e |
|
|
@ -26,18 +26,19 @@ class DocxDocumentProcessor(DocumentProcessor):
|
||||||
|
|
||||||
self.ollama_client = OllamaClient(model_name=settings.OLLAMA_MODEL, base_url=settings.OLLAMA_API_URL)
|
self.ollama_client = OllamaClient(model_name=settings.OLLAMA_MODEL, base_url=settings.OLLAMA_API_URL)
|
||||||
|
|
||||||
# Mineru API configuration
|
# MagicDoc API configuration (replacing Mineru)
|
||||||
self.mineru_base_url = getattr(settings, 'MINERU_API_URL', 'http://mineru-api:8000')
|
self.magicdoc_base_url = getattr(settings, 'MAGICDOC_API_URL', 'http://magicdoc-api:8000')
|
||||||
self.mineru_timeout = getattr(settings, 'MINERU_TIMEOUT', 300) # 5 minutes timeout
|
self.magicdoc_timeout = getattr(settings, 'MAGICDOC_TIMEOUT', 300) # 5 minutes timeout
|
||||||
self.mineru_lang_list = getattr(settings, 'MINERU_LANG_LIST', ['ch'])
|
# MagicDoc uses simpler parameters, but we keep compatibility with existing interface
|
||||||
self.mineru_backend = getattr(settings, 'MINERU_BACKEND', 'pipeline')
|
self.magicdoc_lang_list = getattr(settings, 'MAGICDOC_LANG_LIST', 'ch')
|
||||||
self.mineru_parse_method = getattr(settings, 'MINERU_PARSE_METHOD', 'auto')
|
self.magicdoc_backend = getattr(settings, 'MAGICDOC_BACKEND', 'pipeline')
|
||||||
self.mineru_formula_enable = getattr(settings, 'MINERU_FORMULA_ENABLE', True)
|
self.magicdoc_parse_method = getattr(settings, 'MAGICDOC_PARSE_METHOD', 'auto')
|
||||||
self.mineru_table_enable = getattr(settings, 'MINERU_TABLE_ENABLE', True)
|
self.magicdoc_formula_enable = getattr(settings, 'MAGICDOC_FORMULA_ENABLE', True)
|
||||||
|
self.magicdoc_table_enable = getattr(settings, 'MAGICDOC_TABLE_ENABLE', True)
|
||||||
|
|
||||||
def _call_mineru_api(self, file_path: str) -> Optional[Dict[str, Any]]:
|
def _call_magicdoc_api(self, file_path: str) -> Optional[Dict[str, Any]]:
|
||||||
"""
|
"""
|
||||||
Call Mineru API to convert DOCX to markdown
|
Call MagicDoc API to convert DOCX to markdown
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
file_path: Path to the DOCX file
|
file_path: Path to the DOCX file
|
||||||
|
|
@ -46,19 +47,19 @@ class DocxDocumentProcessor(DocumentProcessor):
|
||||||
API response as dictionary or None if failed
|
API response as dictionary or None if failed
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
url = f"{self.mineru_base_url}/file_parse"
|
url = f"{self.magicdoc_base_url}/file_parse"
|
||||||
|
|
||||||
with open(file_path, 'rb') as file:
|
with open(file_path, 'rb') as file:
|
||||||
files = {'files': (os.path.basename(file_path), file, 'application/vnd.openxmlformats-officedocument.wordprocessingml.document')}
|
files = {'files': (os.path.basename(file_path), file, 'application/vnd.openxmlformats-officedocument.wordprocessingml.document')}
|
||||||
|
|
||||||
# Prepare form data according to Mineru API specification
|
# Prepare form data according to MagicDoc API specification (compatible with Mineru)
|
||||||
data = {
|
data = {
|
||||||
'output_dir': './output',
|
'output_dir': './output',
|
||||||
'lang_list': self.mineru_lang_list,
|
'lang_list': self.magicdoc_lang_list,
|
||||||
'backend': self.mineru_backend,
|
'backend': self.magicdoc_backend,
|
||||||
'parse_method': self.mineru_parse_method,
|
'parse_method': self.magicdoc_parse_method,
|
||||||
'formula_enable': self.mineru_formula_enable,
|
'formula_enable': self.magicdoc_formula_enable,
|
||||||
'table_enable': self.mineru_table_enable,
|
'table_enable': self.magicdoc_table_enable,
|
||||||
'return_md': True,
|
'return_md': True,
|
||||||
'return_middle_json': False,
|
'return_middle_json': False,
|
||||||
'return_model_output': False,
|
'return_model_output': False,
|
||||||
|
|
@ -68,58 +69,58 @@ class DocxDocumentProcessor(DocumentProcessor):
|
||||||
'end_page_id': 99999
|
'end_page_id': 99999
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.info(f"Calling Mineru API for DOCX processing at {url}")
|
logger.info(f"Calling MagicDoc API for DOCX processing at {url}")
|
||||||
response = requests.post(
|
response = requests.post(
|
||||||
url,
|
url,
|
||||||
files=files,
|
files=files,
|
||||||
data=data,
|
data=data,
|
||||||
timeout=self.mineru_timeout
|
timeout=self.magicdoc_timeout
|
||||||
)
|
)
|
||||||
|
|
||||||
if response.status_code == 200:
|
if response.status_code == 200:
|
||||||
result = response.json()
|
result = response.json()
|
||||||
logger.info("Successfully received response from Mineru API for DOCX")
|
logger.info("Successfully received response from MagicDoc API for DOCX")
|
||||||
return result
|
return result
|
||||||
else:
|
else:
|
||||||
error_msg = f"Mineru API returned status code {response.status_code}: {response.text}"
|
error_msg = f"MagicDoc API returned status code {response.status_code}: {response.text}"
|
||||||
logger.error(error_msg)
|
logger.error(error_msg)
|
||||||
# For 400 errors, include more specific information
|
# For 400 errors, include more specific information
|
||||||
if response.status_code == 400:
|
if response.status_code == 400:
|
||||||
try:
|
try:
|
||||||
error_data = response.json()
|
error_data = response.json()
|
||||||
if 'error' in error_data:
|
if 'error' in error_data:
|
||||||
error_msg = f"Mineru API error: {error_data['error']}"
|
error_msg = f"MagicDoc API error: {error_data['error']}"
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
raise Exception(error_msg)
|
raise Exception(error_msg)
|
||||||
|
|
||||||
except requests.exceptions.Timeout:
|
except requests.exceptions.Timeout:
|
||||||
error_msg = f"Mineru API request timed out after {self.mineru_timeout} seconds"
|
error_msg = f"MagicDoc API request timed out after {self.magicdoc_timeout} seconds"
|
||||||
logger.error(error_msg)
|
logger.error(error_msg)
|
||||||
raise Exception(error_msg)
|
raise Exception(error_msg)
|
||||||
except requests.exceptions.RequestException as e:
|
except requests.exceptions.RequestException as e:
|
||||||
error_msg = f"Error calling Mineru API for DOCX: {str(e)}"
|
error_msg = f"Error calling MagicDoc API for DOCX: {str(e)}"
|
||||||
logger.error(error_msg)
|
logger.error(error_msg)
|
||||||
raise Exception(error_msg)
|
raise Exception(error_msg)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
error_msg = f"Unexpected error calling Mineru API for DOCX: {str(e)}"
|
error_msg = f"Unexpected error calling MagicDoc API for DOCX: {str(e)}"
|
||||||
logger.error(error_msg)
|
logger.error(error_msg)
|
||||||
raise Exception(error_msg)
|
raise Exception(error_msg)
|
||||||
|
|
||||||
def _extract_markdown_from_response(self, response: Dict[str, Any]) -> str:
|
def _extract_markdown_from_response(self, response: Dict[str, Any]) -> str:
|
||||||
"""
|
"""
|
||||||
Extract markdown content from Mineru API response
|
Extract markdown content from MagicDoc API response
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
response: Mineru API response dictionary
|
response: MagicDoc API response dictionary
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Extracted markdown content as string
|
Extracted markdown content as string
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
logger.debug(f"Mineru API response structure for DOCX: {response}")
|
logger.debug(f"MagicDoc API response structure for DOCX: {response}")
|
||||||
|
|
||||||
# Try different possible response formats based on Mineru API
|
# Try different possible response formats based on MagicDoc API
|
||||||
if 'markdown' in response:
|
if 'markdown' in response:
|
||||||
return response['markdown']
|
return response['markdown']
|
||||||
elif 'md' in response:
|
elif 'md' in response:
|
||||||
|
|
@ -157,7 +158,7 @@ class DocxDocumentProcessor(DocumentProcessor):
|
||||||
return first_item
|
return first_item
|
||||||
else:
|
else:
|
||||||
# If no standard format found, try to extract from the response structure
|
# If no standard format found, try to extract from the response structure
|
||||||
logger.warning("Could not find standard markdown field in Mineru response for DOCX")
|
logger.warning("Could not find standard markdown field in MagicDoc response for DOCX")
|
||||||
|
|
||||||
# Return the response as string if it's simple, or empty string
|
# Return the response as string if it's simple, or empty string
|
||||||
if isinstance(response, str):
|
if isinstance(response, str):
|
||||||
|
|
@ -176,21 +177,21 @@ class DocxDocumentProcessor(DocumentProcessor):
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error extracting markdown from Mineru response for DOCX: {str(e)}")
|
logger.error(f"Error extracting markdown from MagicDoc response for DOCX: {str(e)}")
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
def read_content(self) -> str:
|
def read_content(self) -> str:
|
||||||
logger.info("Starting DOCX content processing with Mineru API")
|
logger.info("Starting DOCX content processing with MagicDoc API")
|
||||||
|
|
||||||
# Call Mineru API to convert DOCX to markdown
|
# Call MagicDoc API to convert DOCX to markdown
|
||||||
# This will raise an exception if the API call fails
|
# This will raise an exception if the API call fails
|
||||||
mineru_response = self._call_mineru_api(self.input_path)
|
magicdoc_response = self._call_magicdoc_api(self.input_path)
|
||||||
|
|
||||||
# Extract markdown content from the response
|
# Extract markdown content from the response
|
||||||
markdown_content = self._extract_markdown_from_response(mineru_response)
|
markdown_content = self._extract_markdown_from_response(magicdoc_response)
|
||||||
|
|
||||||
if not markdown_content:
|
if not markdown_content:
|
||||||
raise Exception("No markdown content found in Mineru API response for DOCX")
|
raise Exception("No markdown content found in MagicDoc API response for DOCX")
|
||||||
|
|
||||||
logger.info(f"Successfully extracted {len(markdown_content)} characters of markdown content from DOCX")
|
logger.info(f"Successfully extracted {len(markdown_content)} characters of markdown content from DOCX")
|
||||||
|
|
||||||
|
|
|
||||||
127
backend/log
127
backend/log
|
|
@ -1,127 +0,0 @@
|
||||||
[2025-07-14 14:20:19,015: INFO/ForkPoolWorker-4] Raw response from LLM: {
|
|
||||||
celery_worker-1 | "entities": []
|
|
||||||
celery_worker-1 | }
|
|
||||||
celery_worker-1 | [2025-07-14 14:20:19,016: INFO/ForkPoolWorker-4] Parsed mapping: {'entities': []}
|
|
||||||
celery_worker-1 | [2025-07-14 14:20:19,020: INFO/ForkPoolWorker-4] Calling ollama to generate case numbers mapping for chunk (attempt 1/3):
|
|
||||||
celery_worker-1 | 你是一个专业的法律文本实体识别助手。请从以下文本中抽取出所有需要脱敏的敏感信息,并按照指定的类别进行分类。请严格按照JSON格式输出结果。
|
|
||||||
celery_worker-1 |
|
|
||||||
celery_worker-1 | 实体类别包括:
|
|
||||||
celery_worker-1 | - 案号
|
|
||||||
celery_worker-1 |
|
|
||||||
celery_worker-1 | 待处理文本:
|
|
||||||
celery_worker-1 |
|
|
||||||
celery_worker-1 |
|
|
||||||
celery_worker-1 | 二审案件受理费450892 元,由北京丰复久信营销科技有限公司负担(已交纳)。
|
|
||||||
celery_worker-1 |
|
|
||||||
celery_worker-1 | 29. 本判决为终审判决。
|
|
||||||
celery_worker-1 |
|
|
||||||
celery_worker-1 | 审 判 长 史晓霞审 判 员 邓青菁审 判 员 李 淼二〇二二年七月七日法 官 助 理 黎 铧书 记 员 郑海兴
|
|
||||||
celery_worker-1 |
|
|
||||||
celery_worker-1 | 输出格式:
|
|
||||||
celery_worker-1 | {
|
|
||||||
celery_worker-1 | "entities": [
|
|
||||||
celery_worker-1 | {"text": "原始文本内容", "type": "案号"},
|
|
||||||
celery_worker-1 | ...
|
|
||||||
celery_worker-1 | ]
|
|
||||||
celery_worker-1 | }
|
|
||||||
celery_worker-1 |
|
|
||||||
celery_worker-1 | 请严格按照JSON格式输出结果。
|
|
||||||
celery_worker-1 |
|
|
||||||
api-1 | INFO: 192.168.65.1:60045 - "GET /api/v1/files/files HTTP/1.1" 200 OK
|
|
||||||
api-1 | INFO: 192.168.65.1:34054 - "GET /api/v1/files/files HTTP/1.1" 200 OK
|
|
||||||
api-1 | INFO: 192.168.65.1:34054 - "GET /api/v1/files/files HTTP/1.1" 200 OK
|
|
||||||
api-1 | INFO: 192.168.65.1:22084 - "GET /api/v1/files/files HTTP/1.1" 200 OK
|
|
||||||
celery_worker-1 | [2025-07-14 14:20:31,279: INFO/ForkPoolWorker-4] Raw response from LLM: {
|
|
||||||
celery_worker-1 | "entities": []
|
|
||||||
celery_worker-1 | }
|
|
||||||
celery_worker-1 | [2025-07-14 14:20:31,281: INFO/ForkPoolWorker-4] Parsed mapping: {'entities': []}
|
|
||||||
celery_worker-1 | [2025-07-14 14:20:31,287: INFO/ForkPoolWorker-4] Chunk mapping: [{'entities': []}, {'entities': [{'text': '北京丰复久信营销科技有限公司', 'type': '公司名称'}]}, {'entities': []}, {'entities': []}, {'entities': []}]
|
|
||||||
celery_worker-1 | [2025-07-14 14:20:31,288: INFO/ForkPoolWorker-4] Final chunk mappings: [{'entities': [{'text': '郭东军', 'type': '人名'}, {'text': '王欢子', 'type': '人名'}]}, {'entities': [{'text': '北京丰复久信营销科技有限公司', 'type': '公司名称'}, {'text': '丰复久信公司', 'type': '公司名称简称'}, {'text': '中研智创区块链技术有限公司', 'type': '公司名称'}, {'text': '中研智才公司', 'type': '公司名称简称'}]}, {'entities': [{'text': '北京市海淀区北小马厂6 号1 号楼华天大厦1306 室', 'type': '地址'}, {'text': '天津市津南区双港镇工业园区优谷产业园5 号楼-1505', 'type': '地址'}]}, {'entities': [{'text': '服务合同', 'type': '项目名'}]}, {'entities': [{'text': '(2022)京 03 民终 3852 号', 'type': '案号'}, {'text': '(2020)京0105 民初69754 号', 'type': '案号'}]}, {'entities': [{'text': '李圣艳', 'type': '人名'}, {'text': '闫向东', 'type': '人名'}, {'text': '李敏', 'type': '人名'}, {'text': '布兰登·斯密特', 'type': '英文人名'}]}, {'entities': [{'text': '丰复久信公司', 'type': '公司名称'}, {'text': '中研智创公司', 'type': '公司名称'}, {'text': '丰复久信', 'type': '公司名称简称'}, {'text': '中研智创', 'type': '公司名称简称'}]}, {'entities': [{'text': '上海市', 'type': '地址'}, {'text': '北京', 'type': '地址'}]}, {'entities': [{'text': '《计算机设备采购合同》', 'type': '项目名'}]}, {'entities': []}, {'entities': []}, {'entities': [{'text': '丰复久信公司', 'type': '公司名称'}, {'text': '中研智创公司', 'type': '公司名称'}]}, {'entities': []}, {'entities': [{'text': '《服务合同书》', 'type': '项目名'}]}, {'entities': []}, {'entities': []}, {'entities': [{'text': '北京丰复久信营销科技有限公司', 'type': '公司名称'}]}, {'entities': []}, {'entities': []}, {'entities': []}]
|
|
||||||
celery_worker-1 | [2025-07-14 14:20:31,288: INFO/ForkPoolWorker-4] Duplicate entity found: {'text': '丰复久信公司', 'type': '公司名称'}
|
|
||||||
celery_worker-1 | [2025-07-14 14:20:31,288: INFO/ForkPoolWorker-4] Duplicate entity found: {'text': '丰复久信公司', 'type': '公司名称'}
|
|
||||||
celery_worker-1 | [2025-07-14 14:20:31,288: INFO/ForkPoolWorker-4] Duplicate entity found: {'text': '中研智创公司', 'type': '公司名称'}
|
|
||||||
celery_worker-1 | [2025-07-14 14:20:31,288: INFO/ForkPoolWorker-4] Duplicate entity found: {'text': '北京丰复久信营销科技有限公司', 'type': '公司名称'}
|
|
||||||
celery_worker-1 | [2025-07-14 14:20:31,288: INFO/ForkPoolWorker-4] Merged 22 unique entities
|
|
||||||
celery_worker-1 | [2025-07-14 14:20:31,288: INFO/ForkPoolWorker-4] Unique entities: [{'text': '郭东军', 'type': '人名'}, {'text': '王欢子', 'type': '人名'}, {'text': '北京丰复久信营销科技有限公司', 'type': '公司名称'}, {'text': '丰复久信公司', 'type': '公司名称简称'}, {'text': '中研智创区块链技术有限公司', 'type': '公司名称'}, {'text': '中研智才公司', 'type': '公司名称简称'}, {'text': '北京市海淀区北小马厂6 号1 号楼华天大厦1306 室', 'type': '地址'}, {'text': '天津市津南区双港镇工业园区优谷产业园5 号楼-1505', 'type': '地址'}, {'text': '服务合同', 'type': '项目名'}, {'text': '(2022)京 03 民终 3852 号', 'type': '案号'}, {'text': '(2020)京0105 民初69754 号', 'type': '案号'}, {'text': '李圣艳', 'type': '人名'}, {'text': '闫向东', 'type': '人名'}, {'text': '李敏', 'type': '人名'}, {'text': '布兰登·斯密特', 'type': '英文人名'}, {'text': '中研智创公司', 'type': '公司名称'}, {'text': '丰复久信', 'type': '公司名称简称'}, {'text': '中研智创', 'type': '公司名称简称'}, {'text': '上海市', 'type': '地址'}, {'text': '北京', 'type': '地址'}, {'text': '《计算机设备采购合同》', 'type': '项目名'}, {'text': '《服务合同书》', 'type': '项目名'}]
|
|
||||||
celery_worker-1 | [2025-07-14 14:20:31,289: INFO/ForkPoolWorker-4] Calling ollama to generate entity linkage (attempt 1/3)
|
|
||||||
api-1 | INFO: 192.168.65.1:52168 - "GET /api/v1/files/files HTTP/1.1" 200 OK
|
|
||||||
api-1 | INFO: 192.168.65.1:61426 - "GET /api/v1/files/files HTTP/1.1" 200 OK
|
|
||||||
api-1 | INFO: 192.168.65.1:30702 - "GET /api/v1/files/files HTTP/1.1" 200 OK
|
|
||||||
api-1 | INFO: 192.168.65.1:48159 - "GET /api/v1/files/files HTTP/1.1" 200 OK
|
|
||||||
api-1 | INFO: 192.168.65.1:16860 - "GET /api/v1/files/files HTTP/1.1" 200 OK
|
|
||||||
api-1 | INFO: 192.168.65.1:21262 - "GET /api/v1/files/files HTTP/1.1" 200 OK
|
|
||||||
api-1 | INFO: 192.168.65.1:45564 - "GET /api/v1/files/files HTTP/1.1" 200 OK
|
|
||||||
api-1 | INFO: 192.168.65.1:32142 - "GET /api/v1/files/files HTTP/1.1" 200 OK
|
|
||||||
api-1 | INFO: 192.168.65.1:27769 - "GET /api/v1/files/files HTTP/1.1" 200 OK
|
|
||||||
api-1 | INFO: 192.168.65.1:21196 - "GET /api/v1/files/files HTTP/1.1" 200 OK
|
|
||||||
celery_worker-1 | [2025-07-14 14:21:21,436: INFO/ForkPoolWorker-4] Raw entity linkage response from LLM: {
|
|
||||||
celery_worker-1 | "entity_groups": [
|
|
||||||
celery_worker-1 | {
|
|
||||||
celery_worker-1 | "group_id": "group_1",
|
|
||||||
celery_worker-1 | "group_type": "公司名称",
|
|
||||||
celery_worker-1 | "entities": [
|
|
||||||
celery_worker-1 | {
|
|
||||||
celery_worker-1 | "text": "北京丰复久信营销科技有限公司",
|
|
||||||
celery_worker-1 | "type": "公司名称",
|
|
||||||
celery_worker-1 | "is_primary": true
|
|
||||||
celery_worker-1 | },
|
|
||||||
celery_worker-1 | {
|
|
||||||
celery_worker-1 | "text": "丰复久信公司",
|
|
||||||
celery_worker-1 | "type": "公司名称简称",
|
|
||||||
celery_worker-1 | "is_primary": false
|
|
||||||
celery_worker-1 | },
|
|
||||||
celery_worker-1 | {
|
|
||||||
celery_worker-1 | "text": "丰复久信",
|
|
||||||
celery_worker-1 | "type": "公司名称简称",
|
|
||||||
celery_worker-1 | "is_primary": false
|
|
||||||
celery_worker-1 | }
|
|
||||||
celery_worker-1 | ]
|
|
||||||
celery_worker-1 | },
|
|
||||||
celery_worker-1 | {
|
|
||||||
celery_worker-1 | "group_id": "group_2",
|
|
||||||
celery_worker-1 | "group_type": "公司名称",
|
|
||||||
celery_worker-1 | "entities": [
|
|
||||||
celery_worker-1 | {
|
|
||||||
celery_worker-1 | "text": "中研智创区块链技术有限公司",
|
|
||||||
celery_worker-1 | "type": "公司名称",
|
|
||||||
celery_worker-1 | "is_primary": true
|
|
||||||
celery_worker-1 | },
|
|
||||||
celery_worker-1 | {
|
|
||||||
celery_worker-1 | "text": "中研智创公司",
|
|
||||||
celery_worker-1 | "type": "公司名称简称",
|
|
||||||
celery_worker-1 | "is_primary": false
|
|
||||||
celery_worker-1 | },
|
|
||||||
celery_worker-1 | {
|
|
||||||
celery_worker-1 | "text": "中研智创",
|
|
||||||
celery_worker-1 | "type": "公司名称简称",
|
|
||||||
celery_worker-1 | "is_primary": false
|
|
||||||
celery_worker-1 | }
|
|
||||||
celery_worker-1 | ]
|
|
||||||
celery_worker-1 | }
|
|
||||||
celery_worker-1 | ]
|
|
||||||
celery_worker-1 | }
|
|
||||||
celery_worker-1 | [2025-07-14 14:21:21,437: INFO/ForkPoolWorker-4] Parsed entity linkage: {'entity_groups': [{'group_id': 'group_1', 'group_type': '公司名称', 'entities': [{'text': '北京丰复久信营销科技有限公司', 'type': '公司名称', 'is_primary': True}, {'text': '丰复久信公司', 'type': '公司名称简称', 'is_primary': False}, {'text': '丰复久信', 'type': '公司名称简称', 'is_primary': False}]}, {'group_id': 'group_2', 'group_type': '公司名称', 'entities': [{'text': '中研智创区块链技术有限公司', 'type': '公司名称', 'is_primary': True}, {'text': '中研智创公司', 'type': '公司名称简称', 'is_primary': False}, {'text': '中研智创', 'type': '公司名称简称', 'is_primary': False}]}]}
|
|
||||||
celery_worker-1 | [2025-07-14 14:21:21,445: INFO/ForkPoolWorker-4] Successfully created entity linkage with 2 groups
|
|
||||||
celery_worker-1 | [2025-07-14 14:21:21,445: INFO/ForkPoolWorker-4] Entity linkage: {'entity_groups': [{'group_id': 'group_1', 'group_type': '公司名称', 'entities': [{'text': '北京丰复久信营销科技有限公司', 'type': '公司名称', 'is_primary': True}, {'text': '丰复久信公司', 'type': '公司名称简称', 'is_primary': False}, {'text': '丰复久信', 'type': '公司名称简称', 'is_primary': False}]}, {'group_id': 'group_2', 'group_type': '公司名称', 'entities': [{'text': '中研智创区块链技术有限公司', 'type': '公司名称', 'is_primary': True}, {'text': '中研智创公司', 'type': '公司名称简称', 'is_primary': False}, {'text': '中研智创', 'type': '公司名称简称', 'is_primary': False}]}]}
|
|
||||||
celery_worker-1 | [2025-07-14 14:21:21,446: INFO/ForkPoolWorker-4] Generated masked mapping for 22 entities
|
|
||||||
celery_worker-1 | [2025-07-14 14:21:21,446: INFO/ForkPoolWorker-4] Combined mapping: {'郭东军': '某', '王欢子': '某甲', '北京丰复久信营销科技有限公司': '某公司', '丰复久信公司': '某公司甲', '中研智创区块链技术有限公司': '某公司乙', '中研智才公司': '某公司丙', '北京市海淀区北小马厂6 号1 号楼华天大厦1306 室': '某乙', '天津市津南区双港镇工业园区优谷产业园5 号楼-1505': '某丙', '服务合同': '某丁', '(2022)京 03 民终 3852 号': '某戊', '(2020)京0105 民初69754 号': '某己', '李圣艳': '某庚', '闫向东': '某辛', '李敏': '某壬', '布兰登·斯密特': '某癸', '中研智创公司': '某公司丁', '丰复久信': '某公司戊', '中研智创': '某公司己', '上海市': '某11', '北京': '某12', '《计算机设备采购合同》': '某13', '《服务合同书》': '某14'}
|
|
||||||
celery_worker-1 | [2025-07-14 14:21:21,446: INFO/ForkPoolWorker-4] Linked entity '北京丰复久信营销科技有限公司' to '北京丰复久信营销科技有限公司' with masked name '某公司'
|
|
||||||
celery_worker-1 | [2025-07-14 14:21:21,446: INFO/ForkPoolWorker-4] Linked entity '丰复久信公司' to '北京丰复久信营销科技有限公司' with masked name '某公司'
|
|
||||||
celery_worker-1 | [2025-07-14 14:21:21,446: INFO/ForkPoolWorker-4] Linked entity '丰复久信' to '北京丰复久信营销科技有限公司' with masked name '某公司'
|
|
||||||
celery_worker-1 | [2025-07-14 14:21:21,446: INFO/ForkPoolWorker-4] Linked entity '中研智创区块链技术有限公司' to '中研智创区块链技术有限公司' with masked name '某公司乙'
|
|
||||||
celery_worker-1 | [2025-07-14 14:21:21,446: INFO/ForkPoolWorker-4] Linked entity '中研智创公司' to '中研智创区块链技术有限公司' with masked name '某公司乙'
|
|
||||||
celery_worker-1 | [2025-07-14 14:21:21,446: INFO/ForkPoolWorker-4] Linked entity '中研智创' to '中研智创区块链技术有限公司' with masked name '某公司乙'
|
|
||||||
celery_worker-1 | [2025-07-14 14:21:21,446: INFO/ForkPoolWorker-4] Final mapping: {'郭东军': '某', '王欢子': '某甲', '北京丰复久信营销科技有限公司': '某公司', '丰复久信公司': '某公司', '中研智创区块链技术有限公司': '某公司乙', '中研智才公司': '某公司丙', '北京市海淀区北小马厂6 号1 号楼华天大厦1306 室': '某乙', '天津市津南区双港镇工业园区优谷产业园5 号楼-1505': '某丙', '服务合同': '某丁', '(2022)京 03 民终 3852 号': '某戊', '(2020)京0105 民初69754 号': '某己', '李圣艳': '某庚', '闫向东': '某辛', '李敏': '某壬', '布兰登·斯密特': '某癸', '中研智创公司': '某公司乙', '丰复久信': '某公司', '中研智创': '某公司乙', '上海市': '某11', '北京': '某12', '《计算机设备采购合同》': '某13', '《服务合同书》': '某14'}
|
|
||||||
celery_worker-1 | [2025-07-14 14:21:21,446: INFO/ForkPoolWorker-4] Successfully masked content
|
|
||||||
celery_worker-1 | [2025-07-14 14:21:21,449: INFO/ForkPoolWorker-4] Successfully saved masked content to /app/storage/processed/47522ea9-c259-4304-bfe4-1d3ed6902ede.md
|
|
||||||
celery_worker-1 | [2025-07-14 14:21:21,470: INFO/ForkPoolWorker-4] Task app.services.file_service.process_file[5cfbca4c-0f6f-4c71-a66b-b22ee2d28139] succeeded in 311.847165101s: None
|
|
||||||
api-1 | INFO: 192.168.65.1:33432 - "GET /api/v1/files/files HTTP/1.1" 200 OK
|
|
||||||
api-1 | INFO: 192.168.65.1:40073 - "GET /api/v1/files/files HTTP/1.1" 200 OK
|
|
||||||
api-1 | INFO: 192.168.65.1:29550 - "GET /api/v1/files/files HTTP/1.1" 200 OK
|
|
||||||
api-1 | INFO: 192.168.65.1:61350 - "GET /api/v1/files/files HTTP/1.1" 200 OK
|
|
||||||
api-1 | INFO: 192.168.65.1:61755 - "GET /api/v1/files/files HTTP/1.1" 200 OK
|
|
||||||
api-1 | INFO: 192.168.65.1:63726 - "GET /api/v1/files/files HTTP/1.1" 200 OK
|
|
||||||
api-1 | INFO: 192.168.65.1:43446 - "GET /api/v1/files/files HTTP/1.1" 200 OK
|
|
||||||
api-1 | INFO: 192.168.65.1:45624 - "GET /api/v1/files/files HTTP/1.1" 200 OK
|
|
||||||
api-1 | INFO: 192.168.65.1:25256 - "GET /api/v1/files/files HTTP/1.1" 200 OK
|
|
||||||
api-1 | INFO: 192.168.65.1:43464 - "GET /api/v1/files/files HTTP/1.1" 200 OK
|
|
||||||
|
|
@ -1,32 +0,0 @@
|
||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Simple test runner script to verify test discovery and execution
|
|
||||||
"""
|
|
||||||
import subprocess
|
|
||||||
import sys
|
|
||||||
import os
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
def run_tests():
|
|
||||||
"""Run pytest with proper configuration"""
|
|
||||||
# Change to backend directory
|
|
||||||
backend_dir = Path(__file__).parent
|
|
||||||
os.chdir(backend_dir)
|
|
||||||
|
|
||||||
# Run pytest
|
|
||||||
cmd = [sys.executable, "-m", "pytest", "tests/", "-v", "--tb=short"]
|
|
||||||
|
|
||||||
print(f"Running tests from: {backend_dir}")
|
|
||||||
print(f"Command: {' '.join(cmd)}")
|
|
||||||
print("-" * 50)
|
|
||||||
|
|
||||||
try:
|
|
||||||
result = subprocess.run(cmd, capture_output=False, text=True)
|
|
||||||
return result.returncode
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Error running tests: {e}")
|
|
||||||
return 1
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
exit_code = run_tests()
|
|
||||||
sys.exit(exit_code)
|
|
||||||
|
|
@ -25,6 +25,29 @@ services:
|
||||||
networks:
|
networks:
|
||||||
- app-network
|
- app-network
|
||||||
|
|
||||||
|
# MagicDoc API Service
|
||||||
|
magicdoc-api:
|
||||||
|
build:
|
||||||
|
context: ./magicdoc
|
||||||
|
dockerfile: Dockerfile
|
||||||
|
platform: linux/amd64
|
||||||
|
ports:
|
||||||
|
- "8002:8000"
|
||||||
|
volumes:
|
||||||
|
- ./magicdoc/storage/uploads:/app/storage/uploads
|
||||||
|
- ./magicdoc/storage/processed:/app/storage/processed
|
||||||
|
environment:
|
||||||
|
- PYTHONUNBUFFERED=1
|
||||||
|
restart: unless-stopped
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 10s
|
||||||
|
retries: 3
|
||||||
|
start_period: 60s
|
||||||
|
networks:
|
||||||
|
- app-network
|
||||||
|
|
||||||
# Backend API Service
|
# Backend API Service
|
||||||
backend-api:
|
backend-api:
|
||||||
build:
|
build:
|
||||||
|
|
@ -40,9 +63,11 @@ services:
|
||||||
- CELERY_BROKER_URL=redis://redis:6379/0
|
- CELERY_BROKER_URL=redis://redis:6379/0
|
||||||
- CELERY_RESULT_BACKEND=redis://redis:6379/0
|
- CELERY_RESULT_BACKEND=redis://redis:6379/0
|
||||||
- MINERU_API_URL=http://mineru-api:8000
|
- MINERU_API_URL=http://mineru-api:8000
|
||||||
|
- MAGICDOC_API_URL=http://magicdoc-api:8000
|
||||||
depends_on:
|
depends_on:
|
||||||
- redis
|
- redis
|
||||||
- mineru-api
|
- mineru-api
|
||||||
|
- magicdoc-api
|
||||||
networks:
|
networks:
|
||||||
- app-network
|
- app-network
|
||||||
|
|
||||||
|
|
@ -60,6 +85,7 @@ services:
|
||||||
- CELERY_BROKER_URL=redis://redis:6379/0
|
- CELERY_BROKER_URL=redis://redis:6379/0
|
||||||
- CELERY_RESULT_BACKEND=redis://redis:6379/0
|
- CELERY_RESULT_BACKEND=redis://redis:6379/0
|
||||||
- MINERU_API_URL=http://mineru-api:8000
|
- MINERU_API_URL=http://mineru-api:8000
|
||||||
|
- MAGICDOC_API_URL=http://magicdoc-api:8000
|
||||||
depends_on:
|
depends_on:
|
||||||
- redis
|
- redis
|
||||||
- backend-api
|
- backend-api
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,38 @@
|
||||||
|
FROM python:3.10-slim
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
# Install system dependencies including LibreOffice
|
||||||
|
RUN apt-get update && apt-get install -y \
|
||||||
|
build-essential \
|
||||||
|
libreoffice \
|
||||||
|
libreoffice-writer \
|
||||||
|
libreoffice-calc \
|
||||||
|
libreoffice-impress \
|
||||||
|
wget \
|
||||||
|
curl \
|
||||||
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
# Copy requirements and install Python packages first
|
||||||
|
COPY requirements.txt .
|
||||||
|
RUN pip install --upgrade pip
|
||||||
|
RUN pip install --no-cache-dir -r requirements.txt
|
||||||
|
|
||||||
|
# Install fairy-doc after numpy and opencv are installed
|
||||||
|
RUN pip install --no-cache-dir "fairy-doc[cpu]"
|
||||||
|
|
||||||
|
# Copy the application code
|
||||||
|
COPY app/ ./app/
|
||||||
|
|
||||||
|
# Create storage directories
|
||||||
|
RUN mkdir -p storage/uploads storage/processed
|
||||||
|
|
||||||
|
# Expose the port the app runs on
|
||||||
|
EXPOSE 8000
|
||||||
|
|
||||||
|
# Health check
|
||||||
|
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
|
||||||
|
CMD curl -f http://localhost:8000/health || exit 1
|
||||||
|
|
||||||
|
# Command to run the application
|
||||||
|
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||||
|
|
@ -0,0 +1,94 @@
|
||||||
|
# MagicDoc API Service
|
||||||
|
|
||||||
|
A FastAPI service that provides document-to-markdown conversion using the Magic-Doc library. This service is designed to be compatible with the existing Mineru API interface.
|
||||||
|
|
||||||
|
## Features
|
||||||
|
|
||||||
|
- Converts DOC, DOCX, PPT, PPTX, and PDF files to markdown
|
||||||
|
- RESTful API interface compatible with the Mineru API
|
||||||
|
- Docker containerization with LibreOffice dependencies
|
||||||
|
- Health check endpoint
|
||||||
|
- File upload support
|
||||||
|
|
||||||
|
## API Endpoints
|
||||||
|
|
||||||
|
### Health Check
|
||||||
|
```
|
||||||
|
GET /health
|
||||||
|
```
|
||||||
|
Returns service health status.
|
||||||
|
|
||||||
|
### File Parse
|
||||||
|
```
|
||||||
|
POST /file_parse
|
||||||
|
```
|
||||||
|
Converts uploaded document to markdown.
|
||||||
|
|
||||||
|
**Parameters:**
|
||||||
|
- `files`: File upload (required)
|
||||||
|
- `output_dir`: Output directory (default: "./output")
|
||||||
|
- `lang_list`: Language list (default: "ch")
|
||||||
|
- `backend`: Backend type (default: "pipeline")
|
||||||
|
- `parse_method`: Parse method (default: "auto")
|
||||||
|
- `formula_enable`: Enable formula processing (default: true)
|
||||||
|
- `table_enable`: Enable table processing (default: true)
|
||||||
|
- `return_md`: Return markdown (default: true)
|
||||||
|
- `return_middle_json`: Return middle JSON (default: false)
|
||||||
|
- `return_model_output`: Return model output (default: false)
|
||||||
|
- `return_content_list`: Return content list (default: false)
|
||||||
|
- `return_images`: Return images (default: false)
|
||||||
|
- `start_page_id`: Start page ID (default: 0)
|
||||||
|
- `end_page_id`: End page ID (default: 99999)
|
||||||
|
|
||||||
|
**Response:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"markdown": "converted markdown content",
|
||||||
|
"md": "converted markdown content",
|
||||||
|
"content": "converted markdown content",
|
||||||
|
"text": "converted markdown content",
|
||||||
|
"time_cost": 1.23,
|
||||||
|
"filename": "document.docx",
|
||||||
|
"status": "success"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Running with Docker
|
||||||
|
|
||||||
|
### Build and run with docker-compose
|
||||||
|
```bash
|
||||||
|
cd magicdoc
|
||||||
|
docker-compose up --build
|
||||||
|
```
|
||||||
|
|
||||||
|
The service will be available at `http://localhost:8002`
|
||||||
|
|
||||||
|
### Build and run with Docker
|
||||||
|
```bash
|
||||||
|
cd magicdoc
|
||||||
|
docker build -t magicdoc-api .
|
||||||
|
docker run -p 8002:8000 magicdoc-api
|
||||||
|
```
|
||||||
|
|
||||||
|
## Integration with Document Processors
|
||||||
|
|
||||||
|
This service is designed to be compatible with the existing document processors. To use it instead of the Mineru API, update the configuration in your document processors:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# In docx_processor.py or pdf_processor.py
|
||||||
|
self.magicdoc_base_url = getattr(settings, 'MAGICDOC_API_URL', 'http://magicdoc-api:8000')
|
||||||
|
```
|
||||||
|
|
||||||
|
## Dependencies
|
||||||
|
|
||||||
|
- Python 3.10
|
||||||
|
- LibreOffice (installed in Docker container)
|
||||||
|
- Magic-Doc library (installed in the Docker image via the `fairy-doc[cpu]` pip package)
|
||||||
|
- FastAPI
|
||||||
|
- Uvicorn
|
||||||
|
|
||||||
|
## Storage
|
||||||
|
|
||||||
|
The service creates the following directories:
|
||||||
|
- `storage/uploads/`: For uploaded files
|
||||||
|
- `storage/processed/`: For processed files
|
||||||
|
|
@ -0,0 +1,152 @@
|
||||||
|
# MagicDoc Service Setup Guide
|
||||||
|
|
||||||
|
This guide explains how to set up and use the MagicDoc API service as an alternative to the Mineru API for document processing.
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
The MagicDoc service provides a FastAPI-based REST API that converts various document formats (DOC, DOCX, PPT, PPTX, PDF) to markdown using the Magic-Doc library. It's designed to be compatible with your existing document processors.
|
||||||
|
|
||||||
|
## Quick Start
|
||||||
|
|
||||||
|
### 1. Build and Run the Service
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd magicdoc
|
||||||
|
./start.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
Or manually:
|
||||||
|
```bash
|
||||||
|
cd magicdoc
|
||||||
|
docker-compose up --build -d
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Verify the Service
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Check health
|
||||||
|
curl http://localhost:8002/health
|
||||||
|
|
||||||
|
# View API documentation (open is macOS-only; use xdg-open on Linux)
|
||||||
|
open http://localhost:8002/docs
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Test with Sample Files
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd magicdoc
|
||||||
|
python test_api.py
|
||||||
|
```
|
||||||
|
|
||||||
|
## API Compatibility
|
||||||
|
|
||||||
|
The MagicDoc API is designed to be compatible with your existing Mineru API interface:
|
||||||
|
|
||||||
|
### Endpoint: `POST /file_parse`
|
||||||
|
|
||||||
|
**Request Format:**
|
||||||
|
- File upload via multipart form data
|
||||||
|
- Same parameters as the Mineru API (all optional except the file; they are accepted for compatibility but ignored by the current implementation)
|
||||||
|
|
||||||
|
**Response Format:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"markdown": "converted content",
|
||||||
|
"md": "converted content",
|
||||||
|
"content": "converted content",
|
||||||
|
"text": "converted content",
|
||||||
|
"time_cost": 1.23,
|
||||||
|
"filename": "document.docx",
|
||||||
|
"status": "success"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Integration with Existing Processors
|
||||||
|
|
||||||
|
To use MagicDoc instead of Mineru in your existing processors:
|
||||||
|
|
||||||
|
### 1. Update Configuration
|
||||||
|
|
||||||
|
Add to your settings:
|
||||||
|
```python
|
||||||
|
MAGICDOC_API_URL = "http://magicdoc-api:8000" # or http://localhost:8002
|
||||||
|
MAGICDOC_TIMEOUT = 300
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Modify Processors
|
||||||
|
|
||||||
|
Replace Mineru API calls with MagicDoc API calls. See `integration_example.py` for detailed examples.
|
||||||
|
|
||||||
|
### 3. Update Docker Compose
|
||||||
|
|
||||||
|
Add the MagicDoc service to your main docker-compose.yml:
|
||||||
|
```yaml
|
||||||
|
services:
|
||||||
|
magicdoc-api:
|
||||||
|
build:
|
||||||
|
context: ./magicdoc
|
||||||
|
dockerfile: Dockerfile
|
||||||
|
ports:
|
||||||
|
- "8002:8000"
|
||||||
|
volumes:
|
||||||
|
- ./magicdoc/storage:/app/storage
|
||||||
|
environment:
|
||||||
|
- PYTHONUNBUFFERED=1
|
||||||
|
restart: unless-stopped
|
||||||
|
```
|
||||||
|
|
||||||
|
## Service Architecture
|
||||||
|
|
||||||
|
```
|
||||||
|
magicdoc/
|
||||||
|
├── app/
|
||||||
|
│ ├── __init__.py
|
||||||
|
│ └── main.py # FastAPI application
|
||||||
|
├── Dockerfile # Container definition
|
||||||
|
├── docker-compose.yml # Service orchestration
|
||||||
|
├── requirements.txt # Python dependencies
|
||||||
|
├── README.md # Service documentation
|
||||||
|
├── SETUP.md # This setup guide
|
||||||
|
├── test_api.py # API testing script
|
||||||
|
├── integration_example.py # Integration examples
|
||||||
|
└── start.sh # Startup script
|
||||||
|
```
|
||||||
|
|
||||||
|
## Dependencies
|
||||||
|
|
||||||
|
- **Python 3.10**: Base runtime
|
||||||
|
- **LibreOffice**: Document processing (installed in container)
|
||||||
|
- **Magic-Doc**: Document conversion library
|
||||||
|
- **FastAPI**: Web framework
|
||||||
|
- **Uvicorn**: ASGI server
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### Service Won't Start
|
||||||
|
1. Check Docker is running
|
||||||
|
2. Verify port 8002 is available
|
||||||
|
3. Check logs: `docker-compose logs`
|
||||||
|
|
||||||
|
### File Conversion Fails
|
||||||
|
1. Verify LibreOffice is working in container
|
||||||
|
2. Check file format is supported
|
||||||
|
3. Review API logs for errors
|
||||||
|
|
||||||
|
### Integration Issues
|
||||||
|
1. Verify API endpoint URL
|
||||||
|
2. Check network connectivity between services
|
||||||
|
3. Ensure response format compatibility
|
||||||
|
|
||||||
|
## Performance Considerations
|
||||||
|
|
||||||
|
- MagicDoc is generally faster than Mineru for simple documents
|
||||||
|
- LibreOffice dependency adds container size
|
||||||
|
- Consider caching for repeated conversions
|
||||||
|
- Monitor memory usage for large files
|
||||||
|
|
||||||
|
## Security Notes
|
||||||
|
|
||||||
|
- Service is intended to run on an internal network; note that the provided docker-compose file also publishes it on host port 8002
|
||||||
|
- File uploads are temporary
|
||||||
|
- No persistent storage of uploaded files
|
||||||
|
- Consider adding authentication for production use
|
||||||
|
|
@ -0,0 +1 @@
|
||||||
|
# MagicDoc FastAPI Application
|
||||||
|
|
@ -0,0 +1,96 @@
|
||||||
|
import os
|
||||||
|
import logging
|
||||||
|
from typing import Dict, Any, Optional
|
||||||
|
from fastapi import FastAPI, File, UploadFile, Form, HTTPException
|
||||||
|
from fastapi.responses import JSONResponse
|
||||||
|
from magic_doc.docconv import DocConverter, S3Config
|
||||||
|
import tempfile
|
||||||
|
import shutil
|
||||||
|
|
||||||
|
# Configure logging
|
||||||
|
logging.basicConfig(level=logging.INFO)
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
app = FastAPI(title="MagicDoc API", version="1.0.0")
|
||||||
|
|
||||||
|
# Global converter instance
|
||||||
|
converter = DocConverter(s3_config=None)
|
||||||
|
|
||||||
|
@app.get("/health")
async def health_check():
    """Liveness probe: report that the MagicDoc API process is up."""
    payload = dict(status="healthy", service="magicdoc-api")
    return payload
|
||||||
|
|
||||||
|
@app.post("/file_parse")
async def parse_file(
    files: UploadFile = File(...),
    output_dir: str = Form("./output"),
    lang_list: str = Form("ch"),
    backend: str = Form("pipeline"),
    parse_method: str = Form("auto"),
    formula_enable: bool = Form(True),
    table_enable: bool = Form(True),
    return_md: bool = Form(True),
    return_middle_json: bool = Form(False),
    return_model_output: bool = Form(False),
    return_content_list: bool = Form(False),
    return_images: bool = Form(False),
    start_page_id: int = Form(0),
    end_page_id: int = Form(99999)
):
    """
    Parse an uploaded document and convert it to markdown.

    The form fields mirror the Mineru ``/file_parse`` interface so existing
    clients can call this service unchanged. Only ``files`` is used; every
    other field is accepted and ignored.

    Returns:
        JSONResponse carrying the markdown under ``markdown``, ``md``,
        ``content`` and ``text``, plus ``time_cost``, ``filename`` and
        ``status``.

    Raises:
        HTTPException: status 500 if saving the upload or converting it fails.
    """
    temp_file_path = None
    try:
        logger.info(f"Processing file: {files.filename}")

        # filename may be missing; splitext() needs a string, so fall back to "".
        suffix = os.path.splitext(files.filename or "")[1]

        # Create temporary file to save uploaded content. The path is recorded
        # before copying so the outer ``finally`` removes the file even when
        # the copy itself fails (delete=False disables automatic removal).
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
            temp_file_path = temp_file.name
            shutil.copyfileobj(files.file, temp_file)

        # Convert file to markdown using magic-doc.
        # NOTE(review): this is a synchronous call inside an async handler, so
        # the event loop is blocked while it runs — confirm this is acceptable.
        markdown_content, time_cost = converter.convert(temp_file_path, conv_timeout=300)

        logger.info(f"Successfully converted {files.filename} to markdown in {time_cost:.2f}s")

        # Return response compatible with Mineru API
        response = {
            "markdown": markdown_content,
            "md": markdown_content,  # Alternative field name
            "content": markdown_content,  # Alternative field name
            "text": markdown_content,  # Alternative field name
            "time_cost": time_cost,
            "filename": files.filename,
            "status": "success"
        }

        return JSONResponse(content=response)

    except Exception as e:
        # logger.exception keeps the traceback, which logger.error dropped.
        logger.exception(f"Error processing file {files.filename}: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Error processing file: {str(e)}") from e

    finally:
        # Clean up temporary file
        if temp_file_path is not None and os.path.exists(temp_file_path):
            os.unlink(temp_file_path)
|
||||||
|
|
||||||
|
@app.get("/")
async def root():
    """Describe the service and list its public endpoints."""
    endpoints = {
        "health": "/health",
        "file_parse": "/file_parse",
    }
    info = {
        "service": "MagicDoc API",
        "version": "1.0.0",
        "description": "Document to Markdown conversion service using Magic-Doc",
        "endpoints": endpoints,
    }
    return info
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
import uvicorn
|
||||||
|
uvicorn.run(app, host="0.0.0.0", port=8000)
|
||||||
|
|
@ -0,0 +1,26 @@
|
||||||
|
version: '3.8'

services:
  magicdoc-api:
    build:
      context: .
      dockerfile: Dockerfile
    platform: linux/amd64
    ports:
      # Host port 8002 -> container port 8000
      - "8002:8000"
    volumes:
      # Bind mounts to host directories; no named volumes are used, so no
      # top-level `volumes:` section is declared.
      - ./storage/uploads:/app/storage/uploads
      - ./storage/processed:/app/storage/processed
    environment:
      - PYTHONUNBUFFERED=1
    restart: unless-stopped
    healthcheck:
      # NOTE(review): requires curl inside the image — confirm the Dockerfile installs it.
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
|
||||||
|
|
@ -0,0 +1,144 @@
|
||||||
|
"""
|
||||||
|
Example of how to integrate MagicDoc API with existing document processors
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Example modification for docx_processor.py
|
||||||
|
# Replace the Mineru API configuration with MagicDoc API configuration
|
||||||
|
|
||||||
|
class DocxDocumentProcessor(DocumentProcessor):
    """Example DOCX processor that obtains markdown from the MagicDoc API."""

    def __init__(self, input_path: str, output_path: str):
        """
        Store the paths, create the work directory and read the API settings.

        Args:
            input_path: Path to the DOCX file to process
            output_path: Path of the output file; its directory also hosts
                the ``.work`` directory
        """
        super().__init__()
        self.input_path = input_path
        self.output_path = output_path
        self.output_dir = os.path.dirname(output_path)
        self.name_without_suff = os.path.splitext(os.path.basename(input_path))[0]

        # Setup work directory for temporary files:
        # <output dir>/.work/<input file name without extension>
        self.work_dir = os.path.join(self.output_dir, ".work", self.name_without_suff)
        os.makedirs(self.work_dir, exist_ok=True)

        self.ollama_client = OllamaClient(model_name=settings.OLLAMA_MODEL, base_url=settings.OLLAMA_API_URL)

        # MagicDoc API configuration (instead of Mineru)
        self.magicdoc_base_url = getattr(settings, 'MAGICDOC_API_URL', 'http://magicdoc-api:8000')
        self.magicdoc_timeout = getattr(settings, 'MAGICDOC_TIMEOUT', 300)  # 5 minutes timeout

    def _call_magicdoc_api(self, file_path: str) -> Optional[Dict[str, Any]]:
        """
        Call MagicDoc API to convert DOCX to markdown

        Args:
            file_path: Path to the DOCX file

        Returns:
            API response as dictionary

        Raises:
            Exception: on timeout, on any other request failure, on a non-200
                status code, or on any unexpected error (e.g. unreadable file)
        """
        url = f"{self.magicdoc_base_url}/file_parse"

        # Prepare form data - simplified compared to Mineru
        data = {
            'output_dir': './output',
            'lang_list': 'ch',
            'backend': 'pipeline',
            'parse_method': 'auto',
            'formula_enable': True,
            'table_enable': True,
            'return_md': True,
            'return_middle_json': False,
            'return_model_output': False,
            'return_content_list': False,
            'return_images': False,
            'start_page_id': 0,
            'end_page_id': 99999
        }

        try:
            with open(file_path, 'rb') as file:
                files = {'files': (os.path.basename(file_path), file, 'application/vnd.openxmlformats-officedocument.wordprocessingml.document')}

                logger.info(f"Calling MagicDoc API for DOCX processing at {url}")
                response = requests.post(
                    url,
                    files=files,
                    data=data,
                    timeout=self.magicdoc_timeout
                )

            if response.status_code == 200:
                result = response.json()
                logger.info("Successfully received response from MagicDoc API for DOCX")
                return result

        except requests.exceptions.Timeout as e:
            error_msg = f"MagicDoc API request timed out after {self.magicdoc_timeout} seconds"
            logger.error(error_msg)
            raise Exception(error_msg) from e
        except requests.exceptions.RequestException as e:
            error_msg = f"Error calling MagicDoc API for DOCX: {str(e)}"
            logger.error(error_msg)
            raise Exception(error_msg) from e
        except Exception as e:
            error_msg = f"Unexpected error calling MagicDoc API for DOCX: {str(e)}"
            logger.error(error_msg)
            raise Exception(error_msg) from e

        # Non-200 responses are reported outside the try block so the generic
        # handler above does not catch, re-log and re-wrap this error.
        error_msg = f"MagicDoc API returned status code {response.status_code}: {response.text}"
        logger.error(error_msg)
        raise Exception(error_msg)

    def read_content(self) -> str:
        """
        Convert the input DOCX to markdown and keep a copy in the work directory.

        Returns:
            The markdown content extracted from the MagicDoc API response

        Raises:
            Exception: if the API call fails or the response holds no markdown
        """
        logger.info("Starting DOCX content processing with MagicDoc API")

        # Call MagicDoc API to convert DOCX to markdown
        magicdoc_response = self._call_magicdoc_api(self.input_path)

        # Extract markdown content from the response
        markdown_content = self._extract_markdown_from_response(magicdoc_response)

        if not markdown_content:
            raise Exception("No markdown content found in MagicDoc API response for DOCX")

        logger.info(f"Successfully extracted {len(markdown_content)} characters of markdown content from DOCX")

        # Save the raw markdown content to work directory for reference
        md_output_path = os.path.join(self.work_dir, f"{self.name_without_suff}.md")
        with open(md_output_path, 'w', encoding='utf-8') as file:
            file.write(markdown_content)

        logger.info(f"Saved raw markdown content from DOCX to {md_output_path}")

        return markdown_content
|
||||||
|
|
||||||
|
# Configuration changes needed in settings.py:
|
||||||
|
"""
|
||||||
|
# Add these settings to your configuration
|
||||||
|
MAGICDOC_API_URL = "http://magicdoc-api:8000" # or http://localhost:8002 for local development
|
||||||
|
MAGICDOC_TIMEOUT = 300 # 5 minutes timeout
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Docker Compose integration:
|
||||||
|
"""
|
||||||
|
# Add to your main docker-compose.yml
|
||||||
|
services:
|
||||||
|
magicdoc-api:
|
||||||
|
build:
|
||||||
|
context: ./magicdoc
|
||||||
|
dockerfile: Dockerfile
|
||||||
|
ports:
|
||||||
|
- "8002:8000"
|
||||||
|
volumes:
|
||||||
|
- ./magicdoc/storage:/app/storage
|
||||||
|
environment:
|
||||||
|
- PYTHONUNBUFFERED=1
|
||||||
|
restart: unless-stopped
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 10s
|
||||||
|
retries: 3
|
||||||
|
start_period: 60s
|
||||||
|
"""
|
||||||
|
|
@ -0,0 +1,7 @@
|
||||||
|
fastapi==0.104.1
|
||||||
|
uvicorn[standard]==0.24.0
|
||||||
|
python-multipart==0.0.6
|
||||||
|
# fairy-doc[cpu]==0.1.0
|
||||||
|
pydantic==2.5.0
|
||||||
|
numpy==1.24.3
|
||||||
|
opencv-python==4.8.1.78
|
||||||
|
|
@ -0,0 +1,34 @@
|
||||||
|
#!/bin/bash

# MagicDoc API Service Startup Script
#
# Builds and starts the service with docker-compose, then polls the health
# endpoint until it answers. Exits non-zero when Docker is not running, when
# docker-compose fails, or when the service never becomes healthy.

HEALTH_URL="http://localhost:8002/health"
MAX_ATTEMPTS=12   # 12 attempts x 5s = 60s, matching the compose start_period
RETRY_DELAY=5

echo "Starting MagicDoc API Service..."

# Check if Docker is running
if ! docker info > /dev/null 2>&1; then
    echo "Error: Docker is not running. Please start Docker first."
    exit 1
fi

# Build and start the service
echo "Building and starting MagicDoc API service..."
if ! docker-compose up --build -d; then
    echo "Error: docker-compose failed to build or start the service."
    exit 1
fi

# Wait for service to be ready: poll instead of a single fixed sleep, because
# start-up time varies.
echo "Waiting for service to be ready..."
healthy=0
for ((attempt = 1; attempt <= MAX_ATTEMPTS; attempt++)); do
    if curl -f "$HEALTH_URL" > /dev/null 2>&1; then
        healthy=1
        break
    fi
    sleep "$RETRY_DELAY"
done

# Check health
echo "Checking service health..."
if [ "$healthy" -eq 1 ]; then
    echo "✅ MagicDoc API service is running successfully!"
    echo "🌐 Service URL: http://localhost:8002"
    echo "📖 API Documentation: http://localhost:8002/docs"
    echo "🔍 Health Check: http://localhost:8002/health"
else
    echo "❌ Service health check failed. Check logs with: docker-compose logs"
fi

echo ""
echo "To stop the service, run: docker-compose down"
echo "To view logs, run: docker-compose logs -f"

# Report failure to the caller so scripts and CI can react to it.
if [ "$healthy" -ne 1 ]; then
    exit 1
fi
|
||||||
|
|
@ -0,0 +1,92 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Test script for MagicDoc API
|
||||||
|
"""
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
|
||||||
|
def test_health_check(base_url="http://localhost:8002", timeout=10):
    """Test health check endpoint.

    Args:
        base_url: Root URL of the MagicDoc API service.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        True when the endpoint answers with HTTP 200, False otherwise
        (including on any error).
    """
    try:
        # Without a timeout, requests waits indefinitely on an unresponsive host.
        response = requests.get(f"{base_url}/health", timeout=timeout)
        print(f"Health check status: {response.status_code}")
        print(f"Response: {response.json()}")
        return response.status_code == 200
    except Exception as e:
        print(f"Health check failed: {e}")
        return False
|
||||||
|
|
||||||
|
def test_file_parse(base_url="http://localhost:8002", file_path=None, timeout=300):
    """Test file parse endpoint.

    Args:
        base_url: Root URL of the MagicDoc API service.
        file_path: Path of the document to upload.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        True when the endpoint answers with HTTP 200, False when the file is
        missing, the status differs, or any error occurs.
    """
    if not file_path or not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return False

    try:
        with open(file_path, 'rb') as f:
            files = {'files': (os.path.basename(file_path), f, 'application/octet-stream')}
            data = {
                'output_dir': './output',
                'lang_list': 'ch',
                'backend': 'pipeline',
                'parse_method': 'auto',
                'formula_enable': True,
                'table_enable': True,
                'return_md': True,
                'return_middle_json': False,
                'return_model_output': False,
                'return_content_list': False,
                'return_images': False,
                'start_page_id': 0,
                'end_page_id': 99999
            }

            # Without a timeout, requests waits indefinitely on a stalled conversion.
            response = requests.post(f"{base_url}/file_parse", files=files, data=data, timeout=timeout)
            print(f"File parse status: {response.status_code}")

            if response.status_code == 200:
                result = response.json()
                print(f"Success! Converted {len(result.get('markdown', ''))} characters")
                print(f"Time cost: {result.get('time_cost', 'N/A')}s")
                return True
            else:
                print(f"Error: {response.text}")
                return False

    except Exception as e:
        print(f"File parse failed: {e}")
        return False
|
||||||
|
|
||||||
|
def main():
    """Run the health check, then exercise file parsing with the first sample that converts."""
    print("Testing MagicDoc API...")

    # Step 1: the service must answer its health endpoint before anything else.
    print("\n1. Testing health check...")
    service_up = test_health_check()
    if not service_up:
        print("Health check failed. Make sure the service is running.")
        return

    # Step 2: try each known sample in order and stop at the first success.
    print("\n2. Testing file parse...")
    candidates = (
        "../sample_doc/20220707_na_decision-2.docx",
        "../sample_doc/20220707_na_decision-2.pdf",
        "../sample_doc/short_doc.md",
    )

    for candidate in candidates:
        if not os.path.exists(candidate):
            print(f"Sample file not found: {candidate}")
            continue
        print(f"Testing with {candidate}...")
        if test_file_parse(file_path=candidate):
            print("File parse test passed!")
            break

    print("\nTest completed!")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
Loading…
Reference in New Issue