Compare commits

...

4 Commits

Author SHA1 Message Date
tigermren 2075218955 feat: 正式fully支持docx 2025-08-18 01:15:40 +08:00
tigermren afddcf4dd7 fix: 解决magic-doc包的问题 2025-08-18 01:01:58 +08:00
tigermren 0820d7bba2 feat:新增magicdoc 2025-08-18 00:40:39 +08:00
tigermren a16b69475e refine: 整理文件 2025-08-17 23:33:56 +08:00
19 changed files with 747 additions and 195 deletions

View File

@ -26,18 +26,19 @@ class DocxDocumentProcessor(DocumentProcessor):
self.ollama_client = OllamaClient(model_name=settings.OLLAMA_MODEL, base_url=settings.OLLAMA_API_URL)
# Mineru API configuration
self.mineru_base_url = getattr(settings, 'MINERU_API_URL', 'http://mineru-api:8000')
self.mineru_timeout = getattr(settings, 'MINERU_TIMEOUT', 300) # 5 minutes timeout
self.mineru_lang_list = getattr(settings, 'MINERU_LANG_LIST', ['ch'])
self.mineru_backend = getattr(settings, 'MINERU_BACKEND', 'pipeline')
self.mineru_parse_method = getattr(settings, 'MINERU_PARSE_METHOD', 'auto')
self.mineru_formula_enable = getattr(settings, 'MINERU_FORMULA_ENABLE', True)
self.mineru_table_enable = getattr(settings, 'MINERU_TABLE_ENABLE', True)
# MagicDoc API configuration (replacing Mineru)
self.magicdoc_base_url = getattr(settings, 'MAGICDOC_API_URL', 'http://magicdoc-api:8000')
self.magicdoc_timeout = getattr(settings, 'MAGICDOC_TIMEOUT', 300) # 5 minutes timeout
# MagicDoc uses simpler parameters, but we keep compatibility with existing interface
self.magicdoc_lang_list = getattr(settings, 'MAGICDOC_LANG_LIST', 'ch')
self.magicdoc_backend = getattr(settings, 'MAGICDOC_BACKEND', 'pipeline')
self.magicdoc_parse_method = getattr(settings, 'MAGICDOC_PARSE_METHOD', 'auto')
self.magicdoc_formula_enable = getattr(settings, 'MAGICDOC_FORMULA_ENABLE', True)
self.magicdoc_table_enable = getattr(settings, 'MAGICDOC_TABLE_ENABLE', True)
def _call_mineru_api(self, file_path: str) -> Optional[Dict[str, Any]]:
def _call_magicdoc_api(self, file_path: str) -> Optional[Dict[str, Any]]:
"""
Call Mineru API to convert DOCX to markdown
Call MagicDoc API to convert DOCX to markdown
Args:
file_path: Path to the DOCX file
@ -46,19 +47,19 @@ class DocxDocumentProcessor(DocumentProcessor):
API response as dictionary or None if failed
"""
try:
url = f"{self.mineru_base_url}/file_parse"
url = f"{self.magicdoc_base_url}/file_parse"
with open(file_path, 'rb') as file:
files = {'files': (os.path.basename(file_path), file, 'application/vnd.openxmlformats-officedocument.wordprocessingml.document')}
# Prepare form data according to Mineru API specification
# Prepare form data according to MagicDoc API specification (compatible with Mineru)
data = {
'output_dir': './output',
'lang_list': self.mineru_lang_list,
'backend': self.mineru_backend,
'parse_method': self.mineru_parse_method,
'formula_enable': self.mineru_formula_enable,
'table_enable': self.mineru_table_enable,
'lang_list': self.magicdoc_lang_list,
'backend': self.magicdoc_backend,
'parse_method': self.magicdoc_parse_method,
'formula_enable': self.magicdoc_formula_enable,
'table_enable': self.magicdoc_table_enable,
'return_md': True,
'return_middle_json': False,
'return_model_output': False,
@ -68,58 +69,58 @@ class DocxDocumentProcessor(DocumentProcessor):
'end_page_id': 99999
}
logger.info(f"Calling Mineru API for DOCX processing at {url}")
logger.info(f"Calling MagicDoc API for DOCX processing at {url}")
response = requests.post(
url,
files=files,
data=data,
timeout=self.mineru_timeout
timeout=self.magicdoc_timeout
)
if response.status_code == 200:
result = response.json()
logger.info("Successfully received response from Mineru API for DOCX")
logger.info("Successfully received response from MagicDoc API for DOCX")
return result
else:
error_msg = f"Mineru API returned status code {response.status_code}: {response.text}"
error_msg = f"MagicDoc API returned status code {response.status_code}: {response.text}"
logger.error(error_msg)
# For 400 errors, include more specific information
if response.status_code == 400:
try:
error_data = response.json()
if 'error' in error_data:
error_msg = f"Mineru API error: {error_data['error']}"
error_msg = f"MagicDoc API error: {error_data['error']}"
except:
pass
raise Exception(error_msg)
except requests.exceptions.Timeout:
error_msg = f"Mineru API request timed out after {self.mineru_timeout} seconds"
error_msg = f"MagicDoc API request timed out after {self.magicdoc_timeout} seconds"
logger.error(error_msg)
raise Exception(error_msg)
except requests.exceptions.RequestException as e:
error_msg = f"Error calling Mineru API for DOCX: {str(e)}"
error_msg = f"Error calling MagicDoc API for DOCX: {str(e)}"
logger.error(error_msg)
raise Exception(error_msg)
except Exception as e:
error_msg = f"Unexpected error calling Mineru API for DOCX: {str(e)}"
error_msg = f"Unexpected error calling MagicDoc API for DOCX: {str(e)}"
logger.error(error_msg)
raise Exception(error_msg)
def _extract_markdown_from_response(self, response: Dict[str, Any]) -> str:
"""
Extract markdown content from Mineru API response
Extract markdown content from MagicDoc API response
Args:
response: Mineru API response dictionary
response: MagicDoc API response dictionary
Returns:
Extracted markdown content as string
"""
try:
logger.debug(f"Mineru API response structure for DOCX: {response}")
logger.debug(f"MagicDoc API response structure for DOCX: {response}")
# Try different possible response formats based on Mineru API
# Try different possible response formats based on MagicDoc API
if 'markdown' in response:
return response['markdown']
elif 'md' in response:
@ -157,7 +158,7 @@ class DocxDocumentProcessor(DocumentProcessor):
return first_item
else:
# If no standard format found, try to extract from the response structure
logger.warning("Could not find standard markdown field in Mineru response for DOCX")
logger.warning("Could not find standard markdown field in MagicDoc response for DOCX")
# Return the response as string if it's simple, or empty string
if isinstance(response, str):
@ -176,21 +177,21 @@ class DocxDocumentProcessor(DocumentProcessor):
return ""
except Exception as e:
logger.error(f"Error extracting markdown from Mineru response for DOCX: {str(e)}")
logger.error(f"Error extracting markdown from MagicDoc response for DOCX: {str(e)}")
return ""
def read_content(self) -> str:
logger.info("Starting DOCX content processing with Mineru API")
logger.info("Starting DOCX content processing with MagicDoc API")
# Call Mineru API to convert DOCX to markdown
# Call MagicDoc API to convert DOCX to markdown
# This will raise an exception if the API call fails
mineru_response = self._call_mineru_api(self.input_path)
magicdoc_response = self._call_magicdoc_api(self.input_path)
# Extract markdown content from the response
markdown_content = self._extract_markdown_from_response(mineru_response)
markdown_content = self._extract_markdown_from_response(magicdoc_response)
if not markdown_content:
raise Exception("No markdown content found in Mineru API response for DOCX")
raise Exception("No markdown content found in MagicDoc API response for DOCX")
logger.info(f"Successfully extracted {len(markdown_content)} characters of markdown content from DOCX")

View File

@ -1,127 +0,0 @@
[2025-07-14 14:20:19,015: INFO/ForkPoolWorker-4] Raw response from LLM: {
celery_worker-1 | "entities": []
celery_worker-1 | }
celery_worker-1 | [2025-07-14 14:20:19,016: INFO/ForkPoolWorker-4] Parsed mapping: {'entities': []}
celery_worker-1 | [2025-07-14 14:20:19,020: INFO/ForkPoolWorker-4] Calling ollama to generate case numbers mapping for chunk (attempt 1/3):
celery_worker-1 | 你是一个专业的法律文本实体识别助手。请从以下文本中抽取出所有需要脱敏的敏感信息并按照指定的类别进行分类。请严格按照JSON格式输出结果。
celery_worker-1 |
celery_worker-1 | 实体类别包括:
celery_worker-1 | - 案号
celery_worker-1 |
celery_worker-1 | 待处理文本:
celery_worker-1 |
celery_worker-1 |
celery_worker-1 | 二审案件受理费450892 元,由北京丰复久信营销科技有限公司负担(已交纳)。
celery_worker-1 |
celery_worker-1 | 29. 本判决为终审判决。
celery_worker-1 |
celery_worker-1 | 审 判 长 史晓霞审 判 员 邓青菁审 判 员 李 淼二〇二二年七月七日法 官 助 理 黎 铧书 记 员 郑海兴
celery_worker-1 |
celery_worker-1 | 输出格式:
celery_worker-1 | {
celery_worker-1 | "entities": [
celery_worker-1 | {"text": "原始文本内容", "type": "案号"},
celery_worker-1 | ...
celery_worker-1 | ]
celery_worker-1 | }
celery_worker-1 |
celery_worker-1 | 请严格按照JSON格式输出结果。
celery_worker-1 |
api-1 | INFO: 192.168.65.1:60045 - "GET /api/v1/files/files HTTP/1.1" 200 OK
api-1 | INFO: 192.168.65.1:34054 - "GET /api/v1/files/files HTTP/1.1" 200 OK
api-1 | INFO: 192.168.65.1:34054 - "GET /api/v1/files/files HTTP/1.1" 200 OK
api-1 | INFO: 192.168.65.1:22084 - "GET /api/v1/files/files HTTP/1.1" 200 OK
celery_worker-1 | [2025-07-14 14:20:31,279: INFO/ForkPoolWorker-4] Raw response from LLM: {
celery_worker-1 | "entities": []
celery_worker-1 | }
celery_worker-1 | [2025-07-14 14:20:31,281: INFO/ForkPoolWorker-4] Parsed mapping: {'entities': []}
celery_worker-1 | [2025-07-14 14:20:31,287: INFO/ForkPoolWorker-4] Chunk mapping: [{'entities': []}, {'entities': [{'text': '北京丰复久信营销科技有限公司', 'type': '公司名称'}]}, {'entities': []}, {'entities': []}, {'entities': []}]
celery_worker-1 | [2025-07-14 14:20:31,288: INFO/ForkPoolWorker-4] Final chunk mappings: [{'entities': [{'text': '郭东军', 'type': '人名'}, {'text': '王欢子', 'type': '人名'}]}, {'entities': [{'text': '北京丰复久信营销科技有限公司', 'type': '公司名称'}, {'text': '丰复久信公司', 'type': '公司名称简称'}, {'text': '中研智创区块链技术有限公司', 'type': '公司名称'}, {'text': '中研智才公司', 'type': '公司名称简称'}]}, {'entities': [{'text': '北京市海淀区北小马厂6 号1 号楼华天大厦1306 室', 'type': '地址'}, {'text': '天津市津南区双港镇工业园区优谷产业园5 号楼-1505', 'type': '地址'}]}, {'entities': [{'text': '服务合同', 'type': '项目名'}]}, {'entities': [{'text': '(2022)京 03 民终 3852 号', 'type': '案号'}, {'text': '2020京0105 民初69754 号', 'type': '案号'}]}, {'entities': [{'text': '李圣艳', 'type': '人名'}, {'text': '闫向东', 'type': '人名'}, {'text': '李敏', 'type': '人名'}, {'text': '布兰登·斯密特', 'type': '英文人名'}]}, {'entities': [{'text': '丰复久信公司', 'type': '公司名称'}, {'text': '中研智创公司', 'type': '公司名称'}, {'text': '丰复久信', 'type': '公司名称简称'}, {'text': '中研智创', 'type': '公司名称简称'}]}, {'entities': [{'text': '上海市', 'type': '地址'}, {'text': '北京', 'type': '地址'}]}, {'entities': [{'text': '《计算机设备采购合同》', 'type': '项目名'}]}, {'entities': []}, {'entities': []}, {'entities': [{'text': '丰复久信公司', 'type': '公司名称'}, {'text': '中研智创公司', 'type': '公司名称'}]}, {'entities': []}, {'entities': [{'text': '《服务合同书》', 'type': '项目名'}]}, {'entities': []}, {'entities': []}, {'entities': [{'text': '北京丰复久信营销科技有限公司', 'type': '公司名称'}]}, {'entities': []}, {'entities': []}, {'entities': []}]
celery_worker-1 | [2025-07-14 14:20:31,288: INFO/ForkPoolWorker-4] Duplicate entity found: {'text': '丰复久信公司', 'type': '公司名称'}
celery_worker-1 | [2025-07-14 14:20:31,288: INFO/ForkPoolWorker-4] Duplicate entity found: {'text': '丰复久信公司', 'type': '公司名称'}
celery_worker-1 | [2025-07-14 14:20:31,288: INFO/ForkPoolWorker-4] Duplicate entity found: {'text': '中研智创公司', 'type': '公司名称'}
celery_worker-1 | [2025-07-14 14:20:31,288: INFO/ForkPoolWorker-4] Duplicate entity found: {'text': '北京丰复久信营销科技有限公司', 'type': '公司名称'}
celery_worker-1 | [2025-07-14 14:20:31,288: INFO/ForkPoolWorker-4] Merged 22 unique entities
celery_worker-1 | [2025-07-14 14:20:31,288: INFO/ForkPoolWorker-4] Unique entities: [{'text': '郭东军', 'type': '人名'}, {'text': '王欢子', 'type': '人名'}, {'text': '北京丰复久信营销科技有限公司', 'type': '公司名称'}, {'text': '丰复久信公司', 'type': '公司名称简称'}, {'text': '中研智创区块链技术有限公司', 'type': '公司名称'}, {'text': '中研智才公司', 'type': '公司名称简称'}, {'text': '北京市海淀区北小马厂6 号1 号楼华天大厦1306 室', 'type': '地址'}, {'text': '天津市津南区双港镇工业园区优谷产业园5 号楼-1505', 'type': '地址'}, {'text': '服务合同', 'type': '项目名'}, {'text': '(2022)京 03 民终 3852 号', 'type': '案号'}, {'text': '2020京0105 民初69754 号', 'type': '案号'}, {'text': '李圣艳', 'type': '人名'}, {'text': '闫向东', 'type': '人名'}, {'text': '李敏', 'type': '人名'}, {'text': '布兰登·斯密特', 'type': '英文人名'}, {'text': '中研智创公司', 'type': '公司名称'}, {'text': '丰复久信', 'type': '公司名称简称'}, {'text': '中研智创', 'type': '公司名称简称'}, {'text': '上海市', 'type': '地址'}, {'text': '北京', 'type': '地址'}, {'text': '《计算机设备采购合同》', 'type': '项目名'}, {'text': '《服务合同书》', 'type': '项目名'}]
celery_worker-1 | [2025-07-14 14:20:31,289: INFO/ForkPoolWorker-4] Calling ollama to generate entity linkage (attempt 1/3)
api-1 | INFO: 192.168.65.1:52168 - "GET /api/v1/files/files HTTP/1.1" 200 OK
api-1 | INFO: 192.168.65.1:61426 - "GET /api/v1/files/files HTTP/1.1" 200 OK
api-1 | INFO: 192.168.65.1:30702 - "GET /api/v1/files/files HTTP/1.1" 200 OK
api-1 | INFO: 192.168.65.1:48159 - "GET /api/v1/files/files HTTP/1.1" 200 OK
api-1 | INFO: 192.168.65.1:16860 - "GET /api/v1/files/files HTTP/1.1" 200 OK
api-1 | INFO: 192.168.65.1:21262 - "GET /api/v1/files/files HTTP/1.1" 200 OK
api-1 | INFO: 192.168.65.1:45564 - "GET /api/v1/files/files HTTP/1.1" 200 OK
api-1 | INFO: 192.168.65.1:32142 - "GET /api/v1/files/files HTTP/1.1" 200 OK
api-1 | INFO: 192.168.65.1:27769 - "GET /api/v1/files/files HTTP/1.1" 200 OK
api-1 | INFO: 192.168.65.1:21196 - "GET /api/v1/files/files HTTP/1.1" 200 OK
celery_worker-1 | [2025-07-14 14:21:21,436: INFO/ForkPoolWorker-4] Raw entity linkage response from LLM: {
celery_worker-1 | "entity_groups": [
celery_worker-1 | {
celery_worker-1 | "group_id": "group_1",
celery_worker-1 | "group_type": "公司名称",
celery_worker-1 | "entities": [
celery_worker-1 | {
celery_worker-1 | "text": "北京丰复久信营销科技有限公司",
celery_worker-1 | "type": "公司名称",
celery_worker-1 | "is_primary": true
celery_worker-1 | },
celery_worker-1 | {
celery_worker-1 | "text": "丰复久信公司",
celery_worker-1 | "type": "公司名称简称",
celery_worker-1 | "is_primary": false
celery_worker-1 | },
celery_worker-1 | {
celery_worker-1 | "text": "丰复久信",
celery_worker-1 | "type": "公司名称简称",
celery_worker-1 | "is_primary": false
celery_worker-1 | }
celery_worker-1 | ]
celery_worker-1 | },
celery_worker-1 | {
celery_worker-1 | "group_id": "group_2",
celery_worker-1 | "group_type": "公司名称",
celery_worker-1 | "entities": [
celery_worker-1 | {
celery_worker-1 | "text": "中研智创区块链技术有限公司",
celery_worker-1 | "type": "公司名称",
celery_worker-1 | "is_primary": true
celery_worker-1 | },
celery_worker-1 | {
celery_worker-1 | "text": "中研智创公司",
celery_worker-1 | "type": "公司名称简称",
celery_worker-1 | "is_primary": false
celery_worker-1 | },
celery_worker-1 | {
celery_worker-1 | "text": "中研智创",
celery_worker-1 | "type": "公司名称简称",
celery_worker-1 | "is_primary": false
celery_worker-1 | }
celery_worker-1 | ]
celery_worker-1 | }
celery_worker-1 | ]
celery_worker-1 | }
celery_worker-1 | [2025-07-14 14:21:21,437: INFO/ForkPoolWorker-4] Parsed entity linkage: {'entity_groups': [{'group_id': 'group_1', 'group_type': '公司名称', 'entities': [{'text': '北京丰复久信营销科技有限公司', 'type': '公司名称', 'is_primary': True}, {'text': '丰复久信公司', 'type': '公司名称简称', 'is_primary': False}, {'text': '丰复久信', 'type': '公司名称简称', 'is_primary': False}]}, {'group_id': 'group_2', 'group_type': '公司名称', 'entities': [{'text': '中研智创区块链技术有限公司', 'type': '公司名称', 'is_primary': True}, {'text': '中研智创公司', 'type': '公司名称简称', 'is_primary': False}, {'text': '中研智创', 'type': '公司名称简称', 'is_primary': False}]}]}
celery_worker-1 | [2025-07-14 14:21:21,445: INFO/ForkPoolWorker-4] Successfully created entity linkage with 2 groups
celery_worker-1 | [2025-07-14 14:21:21,445: INFO/ForkPoolWorker-4] Entity linkage: {'entity_groups': [{'group_id': 'group_1', 'group_type': '公司名称', 'entities': [{'text': '北京丰复久信营销科技有限公司', 'type': '公司名称', 'is_primary': True}, {'text': '丰复久信公司', 'type': '公司名称简称', 'is_primary': False}, {'text': '丰复久信', 'type': '公司名称简称', 'is_primary': False}]}, {'group_id': 'group_2', 'group_type': '公司名称', 'entities': [{'text': '中研智创区块链技术有限公司', 'type': '公司名称', 'is_primary': True}, {'text': '中研智创公司', 'type': '公司名称简称', 'is_primary': False}, {'text': '中研智创', 'type': '公司名称简称', 'is_primary': False}]}]}
celery_worker-1 | [2025-07-14 14:21:21,446: INFO/ForkPoolWorker-4] Generated masked mapping for 22 entities
celery_worker-1 | [2025-07-14 14:21:21,446: INFO/ForkPoolWorker-4] Combined mapping: {'郭东军': '某', '王欢子': '某甲', '北京丰复久信营销科技有限公司': '某公司', '丰复久信公司': '某公司甲', '中研智创区块链技术有限公司': '某公司乙', '中研智才公司': '某公司丙', '北京市海淀区北小马厂6 号1 号楼华天大厦1306 室': '某乙', '天津市津南区双港镇工业园区优谷产业园5 号楼-1505': '某丙', '服务合同': '某丁', '(2022)京 03 民终 3852 号': '某戊', '2020京0105 民初69754 号': '某己', '李圣艳': '某庚', '闫向东': '某辛', '李敏': '某壬', '布兰登·斯密特': '某癸', '中研智创公司': '某公司丁', '丰复久信': '某公司戊', '中研智创': '某公司己', '上海市': '某11', '北京': '某12', '《计算机设备采购合同》': '某13', '《服务合同书》': '某14'}
celery_worker-1 | [2025-07-14 14:21:21,446: INFO/ForkPoolWorker-4] Linked entity '北京丰复久信营销科技有限公司' to '北京丰复久信营销科技有限公司' with masked name '某公司'
celery_worker-1 | [2025-07-14 14:21:21,446: INFO/ForkPoolWorker-4] Linked entity '丰复久信公司' to '北京丰复久信营销科技有限公司' with masked name '某公司'
celery_worker-1 | [2025-07-14 14:21:21,446: INFO/ForkPoolWorker-4] Linked entity '丰复久信' to '北京丰复久信营销科技有限公司' with masked name '某公司'
celery_worker-1 | [2025-07-14 14:21:21,446: INFO/ForkPoolWorker-4] Linked entity '中研智创区块链技术有限公司' to '中研智创区块链技术有限公司' with masked name '某公司乙'
celery_worker-1 | [2025-07-14 14:21:21,446: INFO/ForkPoolWorker-4] Linked entity '中研智创公司' to '中研智创区块链技术有限公司' with masked name '某公司乙'
celery_worker-1 | [2025-07-14 14:21:21,446: INFO/ForkPoolWorker-4] Linked entity '中研智创' to '中研智创区块链技术有限公司' with masked name '某公司乙'
celery_worker-1 | [2025-07-14 14:21:21,446: INFO/ForkPoolWorker-4] Final mapping: {'郭东军': '某', '王欢子': '某甲', '北京丰复久信营销科技有限公司': '某公司', '丰复久信公司': '某公司', '中研智创区块链技术有限公司': '某公司乙', '中研智才公司': '某公司丙', '北京市海淀区北小马厂6 号1 号楼华天大厦1306 室': '某乙', '天津市津南区双港镇工业园区优谷产业园5 号楼-1505': '某丙', '服务合同': '某丁', '(2022)京 03 民终 3852 号': '某戊', '2020京0105 民初69754 号': '某己', '李圣艳': '某庚', '闫向东': '某辛', '李敏': '某壬', '布兰登·斯密特': '某癸', '中研智创公司': '某公司乙', '丰复久信': '某公司', '中研智创': '某公司乙', '上海市': '某11', '北京': '某12', '《计算机设备采购合同》': '某13', '《服务合同书》': '某14'}
celery_worker-1 | [2025-07-14 14:21:21,446: INFO/ForkPoolWorker-4] Successfully masked content
celery_worker-1 | [2025-07-14 14:21:21,449: INFO/ForkPoolWorker-4] Successfully saved masked content to /app/storage/processed/47522ea9-c259-4304-bfe4-1d3ed6902ede.md
celery_worker-1 | [2025-07-14 14:21:21,470: INFO/ForkPoolWorker-4] Task app.services.file_service.process_file[5cfbca4c-0f6f-4c71-a66b-b22ee2d28139] succeeded in 311.847165101s: None
api-1 | INFO: 192.168.65.1:33432 - "GET /api/v1/files/files HTTP/1.1" 200 OK
api-1 | INFO: 192.168.65.1:40073 - "GET /api/v1/files/files HTTP/1.1" 200 OK
api-1 | INFO: 192.168.65.1:29550 - "GET /api/v1/files/files HTTP/1.1" 200 OK
api-1 | INFO: 192.168.65.1:61350 - "GET /api/v1/files/files HTTP/1.1" 200 OK
api-1 | INFO: 192.168.65.1:61755 - "GET /api/v1/files/files HTTP/1.1" 200 OK
api-1 | INFO: 192.168.65.1:63726 - "GET /api/v1/files/files HTTP/1.1" 200 OK
api-1 | INFO: 192.168.65.1:43446 - "GET /api/v1/files/files HTTP/1.1" 200 OK
api-1 | INFO: 192.168.65.1:45624 - "GET /api/v1/files/files HTTP/1.1" 200 OK
api-1 | INFO: 192.168.65.1:25256 - "GET /api/v1/files/files HTTP/1.1" 200 OK
api-1 | INFO: 192.168.65.1:43464 - "GET /api/v1/files/files HTTP/1.1" 200 OK

View File

@ -1,32 +0,0 @@
#!/usr/bin/env python3
"""
Simple test runner script to verify test discovery and execution
"""
import subprocess
import sys
import os
from pathlib import Path
def run_tests():
    """Run the ``tests/`` suite with pytest from this script's directory.

    Returns:
        The exit code of the pytest process, or ``1`` if the process could
        not be run.
    """
    # pytest is handed the relative path "tests/", so the working directory
    # is switched to the folder that contains this script first.
    script_dir = Path(__file__).parent
    os.chdir(script_dir)

    pytest_argv = [sys.executable, "-m", "pytest", "tests/", "-v", "--tb=short"]
    banner = (
        f"Running tests from: {script_dir}",
        f"Command: {' '.join(pytest_argv)}",
        "-" * 50,
    )
    for banner_line in banner:
        print(banner_line)

    try:
        # Output is not captured, so pytest writes straight to this
        # process's stdout/stderr.
        completed = subprocess.run(pytest_argv, capture_output=False, text=True)
    except Exception as exc:
        print(f"Error running tests: {exc}")
        return 1
    return completed.returncode
if __name__ == "__main__":
    # Use the test run's status as this process's exit code.
    sys.exit(run_tests())

View File

@ -25,6 +25,29 @@ services:
networks:
- app-network
# MagicDoc API Service
magicdoc-api:
build:
context: ./magicdoc
dockerfile: Dockerfile
platform: linux/amd64
ports:
- "8002:8000"
volumes:
- ./magicdoc/storage/uploads:/app/storage/uploads
- ./magicdoc/storage/processed:/app/storage/processed
environment:
- PYTHONUNBUFFERED=1
restart: unless-stopped
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
interval: 30s
timeout: 10s
retries: 3
start_period: 60s
networks:
- app-network
# Backend API Service
backend-api:
build:
@ -40,9 +63,11 @@ services:
- CELERY_BROKER_URL=redis://redis:6379/0
- CELERY_RESULT_BACKEND=redis://redis:6379/0
- MINERU_API_URL=http://mineru-api:8000
- MAGICDOC_API_URL=http://magicdoc-api:8000
depends_on:
- redis
- mineru-api
- magicdoc-api
networks:
- app-network
@ -60,6 +85,7 @@ services:
- CELERY_BROKER_URL=redis://redis:6379/0
- CELERY_RESULT_BACKEND=redis://redis:6379/0
- MINERU_API_URL=http://mineru-api:8000
- MAGICDOC_API_URL=http://magicdoc-api:8000
depends_on:
- redis
- backend-api

38
magicdoc/Dockerfile Normal file
View File

@ -0,0 +1,38 @@
# Image for the MagicDoc API service, built on the slim Python 3.10 base image.
FROM python:3.10-slim
# Relative paths in the instructions below (COPY targets, storage dirs) resolve under /app.
WORKDIR /app
# Install system dependencies including LibreOffice.
# curl is used by the HEALTHCHECK instruction near the end of this file.
# build-essential is presumably needed to compile Python packages during pip install - TODO confirm.
# The apt package lists are deleted in the same RUN step so they are not kept in the layer.
RUN apt-get update && apt-get install -y \
build-essential \
libreoffice \
libreoffice-writer \
libreoffice-calc \
libreoffice-impress \
wget \
curl \
&& rm -rf /var/lib/apt/lists/*
# Copy requirements and install Python packages first, before the application
# code is copied, so these layers do not depend on the contents of app/.
COPY requirements.txt .
RUN pip install --upgrade pip
RUN pip install --no-cache-dir -r requirements.txt
# Install fairy-doc after numpy and opencv are installed
# (requirements.txt is expected to provide them; its contents are not shown here).
RUN pip install --no-cache-dir "fairy-doc[cpu]"
# Copy the application code
COPY app/ ./app/
# Create storage directories (relative to WORKDIR, i.e. /app/storage/...)
RUN mkdir -p storage/uploads storage/processed
# Expose the port the app runs on
EXPOSE 8000
# Health check: request the /health endpoint with curl; a failed request
# makes the check command exit with status 1.
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
CMD curl -f http://localhost:8000/health || exit 1
# Command to run the application: serve app.main:app with uvicorn on all interfaces, port 8000.
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]

94
magicdoc/README.md Normal file
View File

@ -0,0 +1,94 @@
# MagicDoc API Service
A FastAPI service that provides document to markdown conversion using the Magic-Doc library. This service is designed to be compatible with the existing Mineru API interface.
## Features
- Converts DOC, DOCX, PPT, PPTX, and PDF files to markdown
- RESTful API interface compatible with Mineru API
- Docker containerization with LibreOffice dependencies
- Health check endpoint
- File upload support
## API Endpoints
### Health Check
```
GET /health
```
Returns service health status.
### File Parse
```
POST /file_parse
```
Converts uploaded document to markdown.
**Parameters:**
- `files`: File upload (required)
- `output_dir`: Output directory (default: "./output")
- `lang_list`: Language list (default: "ch")
- `backend`: Backend type (default: "pipeline")
- `parse_method`: Parse method (default: "auto")
- `formula_enable`: Enable formula processing (default: true)
- `table_enable`: Enable table processing (default: true)
- `return_md`: Return markdown (default: true)
- `return_middle_json`: Return middle JSON (default: false)
- `return_model_output`: Return model output (default: false)
- `return_content_list`: Return content list (default: false)
- `return_images`: Return images (default: false)
- `start_page_id`: Start page ID (default: 0)
- `end_page_id`: End page ID (default: 99999)
**Response:**
```json
{
"markdown": "converted markdown content",
"md": "converted markdown content",
"content": "converted markdown content",
"text": "converted markdown content",
"time_cost": 1.23,
"filename": "document.docx",
"status": "success"
}
```
## Running with Docker
### Build and run with docker-compose
```bash
cd magicdoc
docker-compose up --build
```
The service will be available at `http://localhost:8002`
### Build and run with Docker
```bash
cd magicdoc
docker build -t magicdoc-api .
docker run -p 8002:8000 magicdoc-api
```
## Integration with Document Processors
This service is designed to be compatible with the existing document processors. To use it instead of Mineru API, update the configuration in your document processors:
```python
# In docx_processor.py or pdf_processor.py
self.magicdoc_base_url = getattr(settings, 'MAGICDOC_API_URL', 'http://magicdoc-api:8000')
```
## Dependencies
- Python 3.10
- LibreOffice (installed in Docker container)
- Magic-Doc library
- FastAPI
- Uvicorn
## Storage
The service creates the following directories:
- `storage/uploads/`: For uploaded files
- `storage/processed/`: For processed files

152
magicdoc/SETUP.md Normal file
View File

@ -0,0 +1,152 @@
# MagicDoc Service Setup Guide
This guide explains how to set up and use the MagicDoc API service as an alternative to the Mineru API for document processing.
## Overview
The MagicDoc service provides a FastAPI-based REST API that converts various document formats (DOC, DOCX, PPT, PPTX, PDF) to markdown using the Magic-Doc library. It's designed to be compatible with your existing document processors.
## Quick Start
### 1. Build and Run the Service
```bash
cd magicdoc
./start.sh
```
Or manually:
```bash
cd magicdoc
docker-compose up --build -d
```
### 2. Verify the Service
```bash
# Check health
curl http://localhost:8002/health
# View API documentation
open http://localhost:8002/docs
```
### 3. Test with Sample Files
```bash
cd magicdoc
python test_api.py
```
## API Compatibility
The MagicDoc API is designed to be compatible with your existing Mineru API interface:
### Endpoint: `POST /file_parse`
**Request Format:**
- File upload via multipart form data
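- Accepts the same form fields as the Mineru API; only `files` is required, and the remaining fields are accepted for compatibility but are not used by the conversion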
**Response Format:**
```json
{
"markdown": "converted content",
"md": "converted content",
"content": "converted content",
"text": "converted content",
"time_cost": 1.23,
"filename": "document.docx",
"status": "success"
}
```
## Integration with Existing Processors
To use MagicDoc instead of Mineru in your existing processors:
### 1. Update Configuration
Add to your settings:
```python
MAGICDOC_API_URL = "http://magicdoc-api:8000" # or http://localhost:8002
MAGICDOC_TIMEOUT = 300
```
### 2. Modify Processors
Replace Mineru API calls with MagicDoc API calls. See `integration_example.py` for detailed examples.
### 3. Update Docker Compose
Add the MagicDoc service to your main docker-compose.yml:
```yaml
services:
magicdoc-api:
build:
context: ./magicdoc
dockerfile: Dockerfile
ports:
- "8002:8000"
volumes:
- ./magicdoc/storage:/app/storage
environment:
- PYTHONUNBUFFERED=1
restart: unless-stopped
```
## Service Architecture
```
magicdoc/
├── app/
│ ├── __init__.py
│ └── main.py # FastAPI application
├── Dockerfile # Container definition
├── docker-compose.yml # Service orchestration
├── requirements.txt # Python dependencies
├── README.md # Service documentation
├── SETUP.md # This setup guide
├── test_api.py # API testing script
├── integration_example.py # Integration examples
└── start.sh # Startup script
```
## Dependencies
- **Python 3.10**: Base runtime
- **LibreOffice**: Document processing (installed in container)
- **Magic-Doc**: Document conversion library
- **FastAPI**: Web framework
- **Uvicorn**: ASGI server
## Troubleshooting
### Service Won't Start
1. Check Docker is running
2. Verify port 8002 is available
3. Check logs: `docker-compose logs`
### File Conversion Fails
1. Verify LibreOffice is working in container
2. Check file format is supported
3. Review API logs for errors
### Integration Issues
1. Verify API endpoint URL
2. Check network connectivity between services
3. Ensure response format compatibility
## Performance Considerations
- MagicDoc is generally faster than Mineru for simple documents
- LibreOffice dependency adds container size
- Consider caching for repeated conversions
- Monitor memory usage for large files
## Security Notes
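- Service is attached to the internal Docker network, but the compose examples also publish it on host port 8002; remove the `ports` mapping if it should not be reachable from the host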
- File uploads are temporary
- No persistent storage of uploaded files
- Consider adding authentication for production use

1
magicdoc/app/__init__.py Normal file
View File

@ -0,0 +1 @@
# MagicDoc FastAPI Application

96
magicdoc/app/main.py Normal file
View File

@ -0,0 +1,96 @@
import os
import logging
from typing import Dict, Any, Optional
from fastapi import FastAPI, File, UploadFile, Form, HTTPException
from fastapi.responses import JSONResponse
from magic_doc.docconv import DocConverter, S3Config
import tempfile
import shutil
# Configure logging: basicConfig sets up root logging at INFO level, and this
# module logs through a logger named after the module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# FastAPI application object; the route decorators below register their
# handlers on it.
app = FastAPI(title="MagicDoc API", version="1.0.0")
# Global converter instance, created once at import time and reused by every
# request. It is constructed without an S3 configuration (s3_config=None).
converter = DocConverter(s3_config=None)
@app.get("/health")
async def health_check():
    """Return a static payload indicating that the service is up.

    Nothing is probed here; the response is the same on every call.
    """
    health_payload = dict(status="healthy", service="magicdoc-api")
    return health_payload
@app.post("/file_parse")
async def parse_file(
    files: UploadFile = File(...),
    output_dir: str = Form("./output"),
    lang_list: str = Form("ch"),
    backend: str = Form("pipeline"),
    parse_method: str = Form("auto"),
    formula_enable: bool = Form(True),
    table_enable: bool = Form(True),
    return_md: bool = Form(True),
    return_middle_json: bool = Form(False),
    return_model_output: bool = Form(False),
    return_content_list: bool = Form(False),
    return_images: bool = Form(False),
    start_page_id: int = Form(0),
    end_page_id: int = Form(99999)
):
    """
    Parse an uploaded document and convert it to markdown.

    The form fields mirror the Mineru API interface so existing clients can
    call this endpoint unchanged. Only ``files`` is used by this handler;
    every other field is accepted and ignored.

    Args:
        files: The uploaded document. Its file name supplies the suffix of
            the temporary file handed to the converter.

    Returns:
        A JSON response carrying the markdown under the keys ``markdown``,
        ``md``, ``content`` and ``text``, plus ``time_cost``, ``filename``
        and ``status``.

    Raises:
        HTTPException: 400 if the upload carries no file name; 500 if
            saving or converting the file fails.
    """
    # The temporary file's suffix is taken from the client-supplied name, so
    # reject uploads without one up front instead of failing later with an
    # opaque internal error.
    if not files.filename:
        raise HTTPException(status_code=400, detail="Uploaded file has no filename")

    try:
        logger.info(f"Processing file: {files.filename}")
        suffix = os.path.splitext(files.filename)[1]
        # delete=False: the converter is given a path, so the file must
        # outlive the handle; it is removed explicitly in the finally below.
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
        temp_file_path = temp_file.name
        try:
            # The copy happens inside the try/finally so that a failed copy
            # does not leave the temporary file behind.
            with temp_file:
                shutil.copyfileobj(files.file, temp_file)
            # Convert file to markdown using magic-doc.
            # NOTE(review): this is a synchronous call inside an async
            # handler, so the event loop is occupied while it runs.
            markdown_content, time_cost = converter.convert(temp_file_path, conv_timeout=300)
            logger.info(f"Successfully converted {files.filename} to markdown in {time_cost:.2f}s")
            # Return response compatible with Mineru API: the same markdown
            # is exposed under several alternative field names.
            response = {
                "markdown": markdown_content,
                "md": markdown_content,  # Alternative field name
                "content": markdown_content,  # Alternative field name
                "text": markdown_content,  # Alternative field name
                "time_cost": time_cost,
                "filename": files.filename,
                "status": "success"
            }
            return JSONResponse(content=response)
        finally:
            # Clean up temporary file
            if os.path.exists(temp_file_path):
                os.unlink(temp_file_path)
    except Exception as e:
        # logger.exception records the traceback; chaining keeps the root
        # cause attached to the HTTP error.
        logger.exception(f"Error processing file {files.filename}: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Error processing file: {str(e)}") from e
@app.get("/")
async def root():
    """Describe the service and list the paths of its endpoints."""
    endpoint_paths = {
        "health": "/health",
        "file_parse": "/file_parse",
    }
    service_info = {
        "service": "MagicDoc API",
        "version": "1.0.0",
        "description": "Document to Markdown conversion service using Magic-Doc",
        "endpoints": endpoint_paths,
    }
    return service_info
if __name__ == "__main__":
    # Direct execution: serve the app with uvicorn on all interfaces, port 8000.
    import uvicorn

    server_options = {"host": "0.0.0.0", "port": 8000}
    uvicorn.run(app, **server_options)

View File

@ -0,0 +1,26 @@
version: '3.8'
services:
magicdoc-api:
build:
context: .
dockerfile: Dockerfile
platform: linux/amd64
ports:
- "8002:8000"
volumes:
- ./storage/uploads:/app/storage/uploads
- ./storage/processed:/app/storage/processed
environment:
- PYTHONUNBUFFERED=1
restart: unless-stopped
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
interval: 30s
timeout: 10s
retries: 3
start_period: 60s
volumes:
uploads:
processed:

View File

@ -0,0 +1,144 @@
"""
Example of how to integrate MagicDoc API with existing document processors
"""
# Example modification for docx_processor.py
# Replace the Mineru API configuration with MagicDoc API configuration
class DocxDocumentProcessor(DocumentProcessor):
def __init__(self, input_path: str, output_path: str):
    """Store the input/output paths, create the work directory and set up clients.

    Args:
        input_path: Path of the document to process.
        output_path: Path the processed output is associated with; its
            directory also hosts the ``.work`` scratch folder.
    """
    super().__init__()

    output_parent = os.path.dirname(output_path)
    input_stem = os.path.splitext(os.path.basename(input_path))[0]

    self.input_path = input_path
    self.output_path = output_path
    self.output_dir = output_parent
    self.name_without_suff = input_stem

    # Work directory for temporary files: <output dir>/.work/<input name without extension>
    self.work_dir = os.path.join(output_parent, ".work", input_stem)
    os.makedirs(self.work_dir, exist_ok=True)

    self.ollama_client = OllamaClient(model_name=settings.OLLAMA_MODEL, base_url=settings.OLLAMA_API_URL)

    # MagicDoc API configuration (instead of Mineru); both values fall back
    # to defaults when the setting is absent.
    self.magicdoc_base_url = getattr(settings, 'MAGICDOC_API_URL', 'http://magicdoc-api:8000')
    self.magicdoc_timeout = getattr(settings, 'MAGICDOC_TIMEOUT', 300)  # 5 minutes timeout
def _call_magicdoc_api(self, file_path: str) -> Optional[Dict[str, Any]]:
"""
Call MagicDoc API to convert DOCX to markdown
Args:
file_path: Path to the DOCX file
Returns:
API response as dictionary or None if failed
"""
try:
url = f"{self.magicdoc_base_url}/file_parse"
with open(file_path, 'rb') as file:
files = {'files': (os.path.basename(file_path), file, 'application/vnd.openxmlformats-officedocument.wordprocessingml.document')}
# Prepare form data - simplified compared to Mineru
data = {
'output_dir': './output',
'lang_list': 'ch',
'backend': 'pipeline',
'parse_method': 'auto',
'formula_enable': True,
'table_enable': True,
'return_md': True,
'return_middle_json': False,
'return_model_output': False,
'return_content_list': False,
'return_images': False,
'start_page_id': 0,
'end_page_id': 99999
}
logger.info(f"Calling MagicDoc API for DOCX processing at {url}")
response = requests.post(
url,
files=files,
data=data,
timeout=self.magicdoc_timeout
)
if response.status_code == 200:
result = response.json()
logger.info("Successfully received response from MagicDoc API for DOCX")
return result
else:
error_msg = f"MagicDoc API returned status code {response.status_code}: {response.text}"
logger.error(error_msg)
raise Exception(error_msg)
except requests.exceptions.Timeout:
error_msg = f"MagicDoc API request timed out after {self.magicdoc_timeout} seconds"
logger.error(error_msg)
raise Exception(error_msg)
except requests.exceptions.RequestException as e:
error_msg = f"Error calling MagicDoc API for DOCX: {str(e)}"
logger.error(error_msg)
raise Exception(error_msg)
except Exception as e:
error_msg = f"Unexpected error calling MagicDoc API for DOCX: {str(e)}"
logger.error(error_msg)
raise Exception(error_msg)
def read_content(self) -> str:
logger.info("Starting DOCX content processing with MagicDoc API")
# Call MagicDoc API to convert DOCX to markdown
magicdoc_response = self._call_magicdoc_api(self.input_path)
# Extract markdown content from the response
markdown_content = self._extract_markdown_from_response(magicdoc_response)
if not markdown_content:
raise Exception("No markdown content found in MagicDoc API response for DOCX")
logger.info(f"Successfully extracted {len(markdown_content)} characters of markdown content from DOCX")
# Save the raw markdown content to work directory for reference
md_output_path = os.path.join(self.work_dir, f"{self.name_without_suff}.md")
with open(md_output_path, 'w', encoding='utf-8') as file:
file.write(markdown_content)
logger.info(f"Saved raw markdown content from DOCX to {md_output_path}")
return markdown_content
# Configuration changes needed in settings.py:
"""
# Add these settings to your configuration
MAGICDOC_API_URL = "http://magicdoc-api:8000" # or http://localhost:8002 for local development
MAGICDOC_TIMEOUT = 300 # 5 minutes timeout
"""
# Docker Compose integration:
"""
# Add to your main docker-compose.yml
services:
magicdoc-api:
build:
context: ./magicdoc
dockerfile: Dockerfile
ports:
- "8002:8000"
volumes:
- ./magicdoc/storage:/app/storage
environment:
- PYTHONUNBUFFERED=1
restart: unless-stopped
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
interval: 30s
timeout: 10s
retries: 3
start_period: 60s
"""

View File

@ -0,0 +1,7 @@
fastapi==0.104.1
uvicorn[standard]==0.24.0
python-multipart==0.0.6
# fairy-doc[cpu]==0.1.0
pydantic==2.5.0
numpy==1.24.3
opencv-python==4.8.1.78

34
magicdoc/start.sh Executable file
View File

@ -0,0 +1,34 @@
#!/bin/bash
# MagicDoc API Service Startup Script
#
# Builds and starts the MagicDoc API container with docker-compose, then polls
# the health endpoint until it answers or the retry budget is used up.
# Exits non-zero when Docker is not running, when docker-compose fails, or when
# the service never reports healthy.

HEALTH_URL="http://localhost:8002/health"
# Default budget is 12 attempts x 5 seconds = 60 seconds; both values can be
# overridden through the environment.
MAX_ATTEMPTS="${MAGICDOC_HEALTH_ATTEMPTS:-12}"
RETRY_DELAY="${MAGICDOC_HEALTH_DELAY:-5}"

echo "Starting MagicDoc API Service..."

# Check if Docker is running
if ! docker info > /dev/null 2>&1; then
    echo "Error: Docker is not running. Please start Docker first."
    exit 1
fi

# Build and start the service; stop here if the build or start fails
echo "Building and starting MagicDoc API service..."
if ! docker-compose up --build -d; then
    echo "Error: docker-compose failed to build or start the service."
    exit 1
fi

# Wait for service to be ready by polling the health endpoint
echo "Waiting for service to be ready..."
echo "Checking service health..."
healthy=0
for ((attempt = 1; attempt <= MAX_ATTEMPTS; attempt++)); do
    if curl -f "$HEALTH_URL" > /dev/null 2>&1; then
        healthy=1
        break
    fi
    sleep "$RETRY_DELAY"
done

exit_code=0
if [ "$healthy" -eq 1 ]; then
    echo "✅ MagicDoc API service is running successfully!"
    echo "🌐 Service URL: http://localhost:8002"
    echo "📖 API Documentation: http://localhost:8002/docs"
    echo "🔍 Health Check: http://localhost:8002/health"
else
    echo "❌ Service health check failed. Check logs with: docker-compose logs"
    # Report the failure to the caller instead of always exiting with 0
    exit_code=1
fi

echo ""
echo "To stop the service, run: docker-compose down"
echo "To view logs, run: docker-compose logs -f"

exit "$exit_code"

92
magicdoc/test_api.py Normal file
View File

@ -0,0 +1,92 @@
#!/usr/bin/env python3
"""
Test script for MagicDoc API
"""
import requests
import json
import os
def test_health_check(base_url="http://localhost:8002", timeout=10):
    """Test health check endpoint.

    Args:
        base_url: Root URL of the MagicDoc API service.
        timeout: Seconds to wait for the HTTP response; without a timeout the
            request could block indefinitely on an unresponsive service.

    Returns:
        True when the endpoint answers with HTTP 200, False on any other
        status code or when the request raises.
    """
    try:
        response = requests.get(f"{base_url}/health", timeout=timeout)
        print(f"Health check status: {response.status_code}")
        print(f"Response: {response.json()}")
        return response.status_code == 200
    except Exception as e:
        # Any failure (connection error, timeout, invalid JSON) counts as unhealthy
        print(f"Health check failed: {e}")
        return False
def test_file_parse(base_url="http://localhost:8002", file_path=None, timeout=300):
    """Test file parse endpoint.

    Args:
        base_url: Root URL of the MagicDoc API service.
        file_path: Path of the document to upload.
        timeout: Seconds to wait for the HTTP response; without a timeout the
            request could block indefinitely on an unresponsive service.

    Returns:
        True when the service answers with HTTP 200; False when the file does
        not exist, the service answers with another status code, or the
        request raises.
    """
    if not file_path or not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return False

    try:
        with open(file_path, 'rb') as f:
            files = {'files': (os.path.basename(file_path), f, 'application/octet-stream')}
            data = {
                'output_dir': './output',
                'lang_list': 'ch',
                'backend': 'pipeline',
                'parse_method': 'auto',
                'formula_enable': True,
                'table_enable': True,
                'return_md': True,
                'return_middle_json': False,
                'return_model_output': False,
                'return_content_list': False,
                'return_images': False,
                'start_page_id': 0,
                'end_page_id': 99999
            }

            # The upload must happen while the file handle is still open
            response = requests.post(
                f"{base_url}/file_parse",
                files=files,
                data=data,
                timeout=timeout,
            )

        print(f"File parse status: {response.status_code}")

        if response.status_code == 200:
            result = response.json()
            print(f"Success! Converted {len(result.get('markdown', ''))} characters")
            print(f"Time cost: {result.get('time_cost', 'N/A')}s")
            return True
        else:
            print(f"Error: {response.text}")
            return False

    except Exception as e:
        print(f"File parse failed: {e}")
        return False
def main():
"""Run the MagicDoc API smoke tests: the health check first, then a file-parse attempt on sample documents."""
print("Testing MagicDoc API...")
# Test health check; the file-parse test is skipped when it fails
print("\n1. Testing health check...")
if not test_health_check():
print("Health check failed. Make sure the service is running.")
return
# Test file parse (if sample file exists)
print("\n2. Testing file parse...")
# Candidate sample documents, given as relative paths
sample_files = [
"../sample_doc/20220707_na_decision-2.docx",
"../sample_doc/20220707_na_decision-2.pdf",
"../sample_doc/short_doc.md"
]
# Candidates are tried in order; a path that does not exist is reported as not found.
# NOTE(review): presumably the loop stops after the first successful parse — confirm the nesting of `break`.
for sample_file in sample_files:
if os.path.exists(sample_file):
print(f"Testing with {sample_file}...")
if test_file_parse(file_path=sample_file):
print("File parse test passed!")
break
else:
print(f"Sample file not found: {sample_file}")
print("\nTest completed!")
# Run the smoke tests only when this file is executed as a script
if __name__ == "__main__":
main()