feat: 正式fully支持docx

fix: 解决magic-doc包的问题
feat: 新增magicdoc
2025-08-18 01:15:40 +08:00 · 2025-08-18 01:01:58 +08:00 · 2025-08-18 00:40:39 +08:00 · 2025-08-17 23:33:56 +08:00
19 changed files with 747 additions and 195 deletions
--- a/backend/app/core/document_handlers/processors/docx_processor.py
+++ b/backend/app/core/document_handlers/processors/docx_processor.py
@@ -26,18 +26,19 @@ class DocxDocumentProcessor(DocumentProcessor):
        
        self.ollama_client = OllamaClient(model_name=settings.OLLAMA_MODEL, base_url=settings.OLLAMA_API_URL)
        
-        # Mineru API configuration
-        self.mineru_base_url = getattr(settings, 'MINERU_API_URL', 'http://mineru-api:8000')
-        self.mineru_timeout = getattr(settings, 'MINERU_TIMEOUT', 300)  # 5 minutes timeout
-        self.mineru_lang_list = getattr(settings, 'MINERU_LANG_LIST', ['ch'])
-        self.mineru_backend = getattr(settings, 'MINERU_BACKEND', 'pipeline')
-        self.mineru_parse_method = getattr(settings, 'MINERU_PARSE_METHOD', 'auto')
-        self.mineru_formula_enable = getattr(settings, 'MINERU_FORMULA_ENABLE', True)
-        self.mineru_table_enable = getattr(settings, 'MINERU_TABLE_ENABLE', True)
+        # MagicDoc API configuration (replacing Mineru)
+        self.magicdoc_base_url = getattr(settings, 'MAGICDOC_API_URL', 'http://magicdoc-api:8000')
+        self.magicdoc_timeout = getattr(settings, 'MAGICDOC_TIMEOUT', 300)  # 5 minutes timeout
+        # MagicDoc uses simpler parameters, but we keep compatibility with existing interface
+        self.magicdoc_lang_list = getattr(settings, 'MAGICDOC_LANG_LIST', 'ch')
+        self.magicdoc_backend = getattr(settings, 'MAGICDOC_BACKEND', 'pipeline')
+        self.magicdoc_parse_method = getattr(settings, 'MAGICDOC_PARSE_METHOD', 'auto')
+        self.magicdoc_formula_enable = getattr(settings, 'MAGICDOC_FORMULA_ENABLE', True)
+        self.magicdoc_table_enable = getattr(settings, 'MAGICDOC_TABLE_ENABLE', True)

-    def _call_mineru_api(self, file_path: str) -> Optional[Dict[str, Any]]:
+    def _call_magicdoc_api(self, file_path: str) -> Optional[Dict[str, Any]]:
        """
-        Call Mineru API to convert DOCX to markdown
+        Call MagicDoc API to convert DOCX to markdown
        
        Args:
            file_path: Path to the DOCX file
@@ -46,19 +47,19 @@ class DocxDocumentProcessor(DocumentProcessor):
            API response as dictionary or None if failed
        """
        try:
-            url = f"{self.mineru_base_url}/file_parse"
+            url = f"{self.magicdoc_base_url}/file_parse"
            
            with open(file_path, 'rb') as file:
                files = {'files': (os.path.basename(file_path), file, 'application/vnd.openxmlformats-officedocument.wordprocessingml.document')}
                
-                # Prepare form data according to Mineru API specification
+                # Prepare form data according to MagicDoc API specification (compatible with Mineru)
                data = {
                    'output_dir': './output',
-                    'lang_list': self.mineru_lang_list,
-                    'backend': self.mineru_backend,
-                    'parse_method': self.mineru_parse_method,
-                    'formula_enable': self.mineru_formula_enable,
-                    'table_enable': self.mineru_table_enable,
+                    'lang_list': self.magicdoc_lang_list,
+                    'backend': self.magicdoc_backend,
+                    'parse_method': self.magicdoc_parse_method,
+                    'formula_enable': self.magicdoc_formula_enable,
+                    'table_enable': self.magicdoc_table_enable,
                    'return_md': True,
                    'return_middle_json': False,
                    'return_model_output': False,
@@ -68,58 +69,58 @@ class DocxDocumentProcessor(DocumentProcessor):
                    'end_page_id': 99999
                }
                
-                logger.info(f"Calling Mineru API for DOCX processing at {url}")
+                logger.info(f"Calling MagicDoc API for DOCX processing at {url}")
                response = requests.post(
                    url, 
                    files=files,
                    data=data,
-                    timeout=self.mineru_timeout
+                    timeout=self.magicdoc_timeout
                )
                
                if response.status_code == 200:
                    result = response.json()
-                    logger.info("Successfully received response from Mineru API for DOCX")
+                    logger.info("Successfully received response from MagicDoc API for DOCX")
                    return result
                else:
-                    error_msg = f"Mineru API returned status code {response.status_code}: {response.text}"
+                    error_msg = f"MagicDoc API returned status code {response.status_code}: {response.text}"
                    logger.error(error_msg)
                    # For 400 errors, include more specific information
                    if response.status_code == 400:
                        try:
                            error_data = response.json()
                            if 'error' in error_data:
-                                error_msg = f"Mineru API error: {error_data['error']}"
+                                error_msg = f"MagicDoc API error: {error_data['error']}"
                        except:
                            pass
                    raise Exception(error_msg)
                    
        except requests.exceptions.Timeout:
-            error_msg = f"Mineru API request timed out after {self.mineru_timeout} seconds"
+            error_msg = f"MagicDoc API request timed out after {self.magicdoc_timeout} seconds"
            logger.error(error_msg)
            raise Exception(error_msg)
        except requests.exceptions.RequestException as e:
-            error_msg = f"Error calling Mineru API for DOCX: {str(e)}"
+            error_msg = f"Error calling MagicDoc API for DOCX: {str(e)}"
            logger.error(error_msg)
            raise Exception(error_msg)
        except Exception as e:
-            error_msg = f"Unexpected error calling Mineru API for DOCX: {str(e)}"
+            error_msg = f"Unexpected error calling MagicDoc API for DOCX: {str(e)}"
            logger.error(error_msg)
            raise Exception(error_msg)

    def _extract_markdown_from_response(self, response: Dict[str, Any]) -> str:
        """
-        Extract markdown content from Mineru API response
+        Extract markdown content from MagicDoc API response
        
        Args:
-            response: Mineru API response dictionary
+            response: MagicDoc API response dictionary
            
        Returns:
            Extracted markdown content as string
        """
        try:
-            logger.debug(f"Mineru API response structure for DOCX: {response}")
+            logger.debug(f"MagicDoc API response structure for DOCX: {response}")
            
-            # Try different possible response formats based on Mineru API
+            # Try different possible response formats based on MagicDoc API
            if 'markdown' in response:
                return response['markdown']
            elif 'md' in response:
@@ -157,7 +158,7 @@ class DocxDocumentProcessor(DocumentProcessor):
                    return first_item
            else:
                # If no standard format found, try to extract from the response structure
-                logger.warning("Could not find standard markdown field in Mineru response for DOCX")
+                logger.warning("Could not find standard markdown field in MagicDoc response for DOCX")
                
                # Return the response as string if it's simple, or empty string
                if isinstance(response, str):
@@ -176,21 +177,21 @@ class DocxDocumentProcessor(DocumentProcessor):
                return ""
                
        except Exception as e:
-            logger.error(f"Error extracting markdown from Mineru response for DOCX: {str(e)}")
+            logger.error(f"Error extracting markdown from MagicDoc response for DOCX: {str(e)}")
            return ""

    def read_content(self) -> str:
-        logger.info("Starting DOCX content processing with Mineru API")
+        logger.info("Starting DOCX content processing with MagicDoc API")
        
-        # Call Mineru API to convert DOCX to markdown
+        # Call MagicDoc API to convert DOCX to markdown
        # This will raise an exception if the API call fails
-        mineru_response = self._call_mineru_api(self.input_path)
+        magicdoc_response = self._call_magicdoc_api(self.input_path)
        
        # Extract markdown content from the response
-        markdown_content = self._extract_markdown_from_response(mineru_response)
+        markdown_content = self._extract_markdown_from_response(magicdoc_response)
        
        if not markdown_content:
-            raise Exception("No markdown content found in Mineru API response for DOCX")
+            raise Exception("No markdown content found in MagicDoc API response for DOCX")
        
        logger.info(f"Successfully extracted {len(markdown_content)} characters of markdown content from DOCX")
        
--- a/backend/docs/OLLAMA_CLIENT_ENHANCEMENT.md
+++ b/backend/docs/OLLAMA_CLIENT_ENHANCEMENT.md
--- a/backend/docs/PDF_PROCESSOR_README.md
+++ b/backend/docs/PDF_PROCESSOR_README.md
--- a/backend/docs/REFACTORING_SUMMARY.md
+++ b/backend/docs/REFACTORING_SUMMARY.md
--- a/backend/docs/TEST_SETUP.md
+++ b/backend/docs/TEST_SETUP.md
--- a/backend/log
+++ b/backend/log
@@ -1,127 +0,0 @@
- [2025-07-14 14:20:19,015: INFO/ForkPoolWorker-4] Raw response from LLM: {
-celery_worker-1  |   "entities": []
-celery_worker-1  | }
-celery_worker-1  | [2025-07-14 14:20:19,016: INFO/ForkPoolWorker-4] Parsed mapping: {'entities': []}
-celery_worker-1  | [2025-07-14 14:20:19,020: INFO/ForkPoolWorker-4] Calling ollama to generate case numbers mapping for chunk (attempt 1/3): 
-celery_worker-1  | 你是一个专业的法律文本实体识别助手。请从以下文本中抽取出所有需要脱敏的敏感信息，并按照指定的类别进行分类。请严格按照JSON格式输出结果。
-celery_worker-1  | 
-celery_worker-1  | 实体类别包括:
-celery_worker-1  | - 案号
-celery_worker-1  | 
-celery_worker-1  | 待处理文本:
-celery_worker-1  |   
-celery_worker-1  | 
-celery_worker-1  | 二审案件受理费450892 元，由北京丰复久信营销科技有限公司负担（已交纳）。  
-celery_worker-1  | 
-celery_worker-1  | 29. 本判决为终审判决。  
-celery_worker-1  | 
-celery_worker-1  | 审 判 长 史晓霞审 判 员 邓青菁审 判 员 李 淼二〇二二年七月七日法 官 助 理 黎 铧书 记 员 郑海兴    
-celery_worker-1  | 
-celery_worker-1  | 输出格式:
-celery_worker-1  | {
-celery_worker-1  | "entities": [
-celery_worker-1  |     {"text": "原始文本内容", "type": "案号"},
-celery_worker-1  |     ...
-celery_worker-1  |   ]
-celery_worker-1  | }
-celery_worker-1  | 
-celery_worker-1  | 请严格按照JSON格式输出结果。
-celery_worker-1  | 
-api-1            | INFO:     192.168.65.1:60045 - "GET /api/v1/files/files HTTP/1.1" 200 OK
-api-1            | INFO:     192.168.65.1:34054 - "GET /api/v1/files/files HTTP/1.1" 200 OK
-api-1            | INFO:     192.168.65.1:34054 - "GET /api/v1/files/files HTTP/1.1" 200 OK
-api-1            | INFO:     192.168.65.1:22084 - "GET /api/v1/files/files HTTP/1.1" 200 OK
-celery_worker-1  | [2025-07-14 14:20:31,279: INFO/ForkPoolWorker-4] Raw response from LLM: {
-celery_worker-1  |   "entities": []
-celery_worker-1  | }
-celery_worker-1  | [2025-07-14 14:20:31,281: INFO/ForkPoolWorker-4] Parsed mapping: {'entities': []}
-celery_worker-1  | [2025-07-14 14:20:31,287: INFO/ForkPoolWorker-4] Chunk mapping: [{'entities': []}, {'entities': [{'text': '北京丰复久信营销科技有限公司', 'type': '公司名称'}]}, {'entities': []}, {'entities': []}, {'entities': []}]
-celery_worker-1  | [2025-07-14 14:20:31,288: INFO/ForkPoolWorker-4] Final chunk mappings: [{'entities': [{'text': '郭东军', 'type': '人名'}, {'text': '王欢子', 'type': '人名'}]}, {'entities': [{'text': '北京丰复久信营销科技有限公司', 'type': '公司名称'}, {'text': '丰复久信公司', 'type': '公司名称简称'}, {'text': '中研智创区块链技术有限公司', 'type': '公司名称'}, {'text': '中研智才公司', 'type': '公司名称简称'}]}, {'entities': [{'text': '北京市海淀区北小马厂6 号1 号楼华天大厦1306 室', 'type': '地址'}, {'text': '天津市津南区双港镇工业园区优谷产业园5 号楼-1505', 'type': '地址'}]}, {'entities': [{'text': '服务合同', 'type': '项目名'}]}, {'entities': [{'text': '(2022)京 03 民终 3852 号', 'type': '案号'}, {'text': '（2020）京0105 民初69754 号', 'type': '案号'}]}, {'entities': [{'text': '李圣艳', 'type': '人名'}, {'text': '闫向东', 'type': '人名'}, {'text': '李敏', 'type': '人名'}, {'text': '布兰登·斯密特', 'type': '英文人名'}]}, {'entities': [{'text': '丰复久信公司', 'type': '公司名称'}, {'text': '中研智创公司', 'type': '公司名称'}, {'text': '丰复久信', 'type': '公司名称简称'}, {'text': '中研智创', 'type': '公司名称简称'}]}, {'entities': [{'text': '上海市', 'type': '地址'}, {'text': '北京', 'type': '地址'}]}, {'entities': [{'text': '《计算机设备采购合同》', 'type': '项目名'}]}, {'entities': []}, {'entities': []}, {'entities': [{'text': '丰复久信公司', 'type': '公司名称'}, {'text': '中研智创公司', 'type': '公司名称'}]}, {'entities': []}, {'entities': [{'text': '《服务合同书》', 'type': '项目名'}]}, {'entities': []}, {'entities': []}, {'entities': [{'text': '北京丰复久信营销科技有限公司', 'type': '公司名称'}]}, {'entities': []}, {'entities': []}, {'entities': []}]
-celery_worker-1  | [2025-07-14 14:20:31,288: INFO/ForkPoolWorker-4] Duplicate entity found: {'text': '丰复久信公司', 'type': '公司名称'}
-celery_worker-1  | [2025-07-14 14:20:31,288: INFO/ForkPoolWorker-4] Duplicate entity found: {'text': '丰复久信公司', 'type': '公司名称'}
-celery_worker-1  | [2025-07-14 14:20:31,288: INFO/ForkPoolWorker-4] Duplicate entity found: {'text': '中研智创公司', 'type': '公司名称'}
-celery_worker-1  | [2025-07-14 14:20:31,288: INFO/ForkPoolWorker-4] Duplicate entity found: {'text': '北京丰复久信营销科技有限公司', 'type': '公司名称'}
-celery_worker-1  | [2025-07-14 14:20:31,288: INFO/ForkPoolWorker-4] Merged 22 unique entities
-celery_worker-1  | [2025-07-14 14:20:31,288: INFO/ForkPoolWorker-4] Unique entities: [{'text': '郭东军', 'type': '人名'}, {'text': '王欢子', 'type': '人名'}, {'text': '北京丰复久信营销科技有限公司', 'type': '公司名称'}, {'text': '丰复久信公司', 'type': '公司名称简称'}, {'text': '中研智创区块链技术有限公司', 'type': '公司名称'}, {'text': '中研智才公司', 'type': '公司名称简称'}, {'text': '北京市海淀区北小马厂6 号1 号楼华天大厦1306 室', 'type': '地址'}, {'text': '天津市津南区双港镇工业园区优谷产业园5 号楼-1505', 'type': '地址'}, {'text': '服务合同', 'type': '项目名'}, {'text': '(2022)京 03 民终 3852 号', 'type': '案号'}, {'text': '（2020）京0105 民初69754 号', 'type': '案号'}, {'text': '李圣艳', 'type': '人名'}, {'text': '闫向东', 'type': '人名'}, {'text': '李敏', 'type': '人名'}, {'text': '布兰登·斯密特', 'type': '英文人名'}, {'text': '中研智创公司', 'type': '公司名称'}, {'text': '丰复久信', 'type': '公司名称简称'}, {'text': '中研智创', 'type': '公司名称简称'}, {'text': '上海市', 'type': '地址'}, {'text': '北京', 'type': '地址'}, {'text': '《计算机设备采购合同》', 'type': '项目名'}, {'text': '《服务合同书》', 'type': '项目名'}]
-celery_worker-1  | [2025-07-14 14:20:31,289: INFO/ForkPoolWorker-4] Calling ollama to generate entity linkage (attempt 1/3)
-api-1            | INFO:     192.168.65.1:52168 - "GET /api/v1/files/files HTTP/1.1" 200 OK
-api-1            | INFO:     192.168.65.1:61426 - "GET /api/v1/files/files HTTP/1.1" 200 OK
-api-1            | INFO:     192.168.65.1:30702 - "GET /api/v1/files/files HTTP/1.1" 200 OK
-api-1            | INFO:     192.168.65.1:48159 - "GET /api/v1/files/files HTTP/1.1" 200 OK
-api-1            | INFO:     192.168.65.1:16860 - "GET /api/v1/files/files HTTP/1.1" 200 OK
-api-1            | INFO:     192.168.65.1:21262 - "GET /api/v1/files/files HTTP/1.1" 200 OK
-api-1            | INFO:     192.168.65.1:45564 - "GET /api/v1/files/files HTTP/1.1" 200 OK
-api-1            | INFO:     192.168.65.1:32142 - "GET /api/v1/files/files HTTP/1.1" 200 OK
-api-1            | INFO:     192.168.65.1:27769 - "GET /api/v1/files/files HTTP/1.1" 200 OK
-api-1            | INFO:     192.168.65.1:21196 - "GET /api/v1/files/files HTTP/1.1" 200 OK
-celery_worker-1  | [2025-07-14 14:21:21,436: INFO/ForkPoolWorker-4] Raw entity linkage response from LLM: {
-celery_worker-1  |   "entity_groups": [
-celery_worker-1  |     {
-celery_worker-1  |       "group_id": "group_1",
-celery_worker-1  |       "group_type": "公司名称",
-celery_worker-1  |       "entities": [
-celery_worker-1  |         {
-celery_worker-1  |           "text": "北京丰复久信营销科技有限公司",
-celery_worker-1  |           "type": "公司名称",
-celery_worker-1  |           "is_primary": true
-celery_worker-1  |         },
-celery_worker-1  |         {
-celery_worker-1  |           "text": "丰复久信公司",
-celery_worker-1  |           "type": "公司名称简称",
-celery_worker-1  |           "is_primary": false
-celery_worker-1  |         },
-celery_worker-1  |         {
-celery_worker-1  |           "text": "丰复久信",
-celery_worker-1  |           "type": "公司名称简称",
-celery_worker-1  |           "is_primary": false
-celery_worker-1  |         }
-celery_worker-1  |       ]
-celery_worker-1  |     },
-celery_worker-1  |     {
-celery_worker-1  |       "group_id": "group_2",
-celery_worker-1  |       "group_type": "公司名称",
-celery_worker-1  |       "entities": [
-celery_worker-1  |         {
-celery_worker-1  |           "text": "中研智创区块链技术有限公司",
-celery_worker-1  |           "type": "公司名称",
-celery_worker-1  |           "is_primary": true
-celery_worker-1  |         },
-celery_worker-1  |         {
-celery_worker-1  |           "text": "中研智创公司",
-celery_worker-1  |           "type": "公司名称简称",
-celery_worker-1  |           "is_primary": false
-celery_worker-1  |         },
-celery_worker-1  |         {
-celery_worker-1  |           "text": "中研智创",
-celery_worker-1  |           "type": "公司名称简称",
-celery_worker-1  |           "is_primary": false
-celery_worker-1  |         }
-celery_worker-1  |       ]
-celery_worker-1  |     }
-celery_worker-1  |   ]
-celery_worker-1  | }
-celery_worker-1  | [2025-07-14 14:21:21,437: INFO/ForkPoolWorker-4] Parsed entity linkage: {'entity_groups': [{'group_id': 'group_1', 'group_type': '公司名称', 'entities': [{'text': '北京丰复久信营销科技有限公司', 'type': '公司名称', 'is_primary': True}, {'text': '丰复久信公司', 'type': '公司名称简称', 'is_primary': False}, {'text': '丰复久信', 'type': '公司名称简称', 'is_primary': False}]}, {'group_id': 'group_2', 'group_type': '公司名称', 'entities': [{'text': '中研智创区块链技术有限公司', 'type': '公司名称', 'is_primary': True}, {'text': '中研智创公司', 'type': '公司名称简称', 'is_primary': False}, {'text': '中研智创', 'type': '公司名称简称', 'is_primary': False}]}]}
-celery_worker-1  | [2025-07-14 14:21:21,445: INFO/ForkPoolWorker-4] Successfully created entity linkage with 2 groups
-celery_worker-1  | [2025-07-14 14:21:21,445: INFO/ForkPoolWorker-4] Entity linkage: {'entity_groups': [{'group_id': 'group_1', 'group_type': '公司名称', 'entities': [{'text': '北京丰复久信营销科技有限公司', 'type': '公司名称', 'is_primary': True}, {'text': '丰复久信公司', 'type': '公司名称简称', 'is_primary': False}, {'text': '丰复久信', 'type': '公司名称简称', 'is_primary': False}]}, {'group_id': 'group_2', 'group_type': '公司名称', 'entities': [{'text': '中研智创区块链技术有限公司', 'type': '公司名称', 'is_primary': True}, {'text': '中研智创公司', 'type': '公司名称简称', 'is_primary': False}, {'text': '中研智创', 'type': '公司名称简称', 'is_primary': False}]}]}
-celery_worker-1  | [2025-07-14 14:21:21,446: INFO/ForkPoolWorker-4] Generated masked mapping for 22 entities
-celery_worker-1  | [2025-07-14 14:21:21,446: INFO/ForkPoolWorker-4] Combined mapping: {'郭东军': '某', '王欢子': '某甲', '北京丰复久信营销科技有限公司': '某公司', '丰复久信公司': '某公司甲', '中研智创区块链技术有限公司': '某公司乙', '中研智才公司': '某公司丙', '北京市海淀区北小马厂6 号1 号楼华天大厦1306 室': '某乙', '天津市津南区双港镇工业园区优谷产业园5 号楼-1505': '某丙', '服务合同': '某丁', '(2022)京 03 民终 3852 号': '某戊', '（2020）京0105 民初69754 号': '某己', '李圣艳': '某庚', '闫向东': '某辛', '李敏': '某壬', '布兰登·斯密特': '某癸', '中研智创公司': '某公司丁', '丰复久信': '某公司戊', '中研智创': '某公司己', '上海市': '某11', '北京': '某12', '《计算机设备采购合同》': '某13', '《服务合同书》': '某14'}
-celery_worker-1  | [2025-07-14 14:21:21,446: INFO/ForkPoolWorker-4] Linked entity '北京丰复久信营销科技有限公司' to '北京丰复久信营销科技有限公司' with masked name '某公司'
-celery_worker-1  | [2025-07-14 14:21:21,446: INFO/ForkPoolWorker-4] Linked entity '丰复久信公司' to '北京丰复久信营销科技有限公司' with masked name '某公司'
-celery_worker-1  | [2025-07-14 14:21:21,446: INFO/ForkPoolWorker-4] Linked entity '丰复久信' to '北京丰复久信营销科技有限公司' with masked name '某公司'
-celery_worker-1  | [2025-07-14 14:21:21,446: INFO/ForkPoolWorker-4] Linked entity '中研智创区块链技术有限公司' to '中研智创区块链技术有限公司' with masked name '某公司乙'
-celery_worker-1  | [2025-07-14 14:21:21,446: INFO/ForkPoolWorker-4] Linked entity '中研智创公司' to '中研智创区块链技术有限公司' with masked name '某公司乙'
-celery_worker-1  | [2025-07-14 14:21:21,446: INFO/ForkPoolWorker-4] Linked entity '中研智创' to '中研智创区块链技术有限公司' with masked name '某公司乙'
-celery_worker-1  | [2025-07-14 14:21:21,446: INFO/ForkPoolWorker-4] Final mapping: {'郭东军': '某', '王欢子': '某甲', '北京丰复久信营销科技有限公司': '某公司', '丰复久信公司': '某公司', '中研智创区块链技术有限公司': '某公司乙', '中研智才公司': '某公司丙', '北京市海淀区北小马厂6 号1 号楼华天大厦1306 室': '某乙', '天津市津南区双港镇工业园区优谷产业园5 号楼-1505': '某丙', '服务合同': '某丁', '(2022)京 03 民终 3852 号': '某戊', '（2020）京0105 民初69754 号': '某己', '李圣艳': '某庚', '闫向东': '某辛', '李敏': '某壬', '布兰登·斯密特': '某癸', '中研智创公司': '某公司乙', '丰复久信': '某公司', '中研智创': '某公司乙', '上海市': '某11', '北京': '某12', '《计算机设备采购合同》': '某13', '《服务合同书》': '某14'}
-celery_worker-1  | [2025-07-14 14:21:21,446: INFO/ForkPoolWorker-4] Successfully masked content
-celery_worker-1  | [2025-07-14 14:21:21,449: INFO/ForkPoolWorker-4] Successfully saved masked content to /app/storage/processed/47522ea9-c259-4304-bfe4-1d3ed6902ede.md
-celery_worker-1  | [2025-07-14 14:21:21,470: INFO/ForkPoolWorker-4] Task app.services.file_service.process_file[5cfbca4c-0f6f-4c71-a66b-b22ee2d28139] succeeded in 311.847165101s: None
-api-1            | INFO:     192.168.65.1:33432 - "GET /api/v1/files/files HTTP/1.1" 200 OK
-api-1            | INFO:     192.168.65.1:40073 - "GET /api/v1/files/files HTTP/1.1" 200 OK
-api-1            | INFO:     192.168.65.1:29550 - "GET /api/v1/files/files HTTP/1.1" 200 OK
-api-1            | INFO:     192.168.65.1:61350 - "GET /api/v1/files/files HTTP/1.1" 200 OK
-api-1            | INFO:     192.168.65.1:61755 - "GET /api/v1/files/files HTTP/1.1" 200 OK
-api-1            | INFO:     192.168.65.1:63726 - "GET /api/v1/files/files HTTP/1.1" 200 OK
-api-1            | INFO:     192.168.65.1:43446 - "GET /api/v1/files/files HTTP/1.1" 200 OK
-api-1            | INFO:     192.168.65.1:45624 - "GET /api/v1/files/files HTTP/1.1" 200 OK
-api-1            | INFO:     192.168.65.1:25256 - "GET /api/v1/files/files HTTP/1.1" 200 OK
-api-1            | INFO:     192.168.65.1:43464 - "GET /api/v1/files/files HTTP/1.1" 200 OK
--- a/backend/run_tests.py
+++ b/backend/run_tests.py
@@ -1,32 +0,0 @@
-#!/usr/bin/env python3
-"""
-Simple test runner script to verify test discovery and execution
-"""
-import subprocess
-import sys
-import os
-from pathlib import Path
-
-def run_tests():
-    """Run pytest with proper configuration"""
-    # Change to backend directory
-    backend_dir = Path(__file__).parent
-    os.chdir(backend_dir)
-    
-    # Run pytest
-    cmd = [sys.executable, "-m", "pytest", "tests/", "-v", "--tb=short"]
-    
-    print(f"Running tests from: {backend_dir}")
-    print(f"Command: {' '.join(cmd)}")
-    print("-" * 50)
-    
-    try:
-        result = subprocess.run(cmd, capture_output=False, text=True)
-        return result.returncode
-    except Exception as e:
-        print(f"Error running tests: {e}")
-        return 1
-
-if __name__ == "__main__":
-    exit_code = run_tests()
-    sys.exit(exit_code)
--- a/backend/tests/test_enhanced_ollama_client.py
+++ b/backend/tests/test_enhanced_ollama_client.py
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -25,6 +25,29 @@ services:
    networks:
      - app-network

+  # MagicDoc API Service
+  magicdoc-api:
+    build:
+      context: ./magicdoc
+      dockerfile: Dockerfile
+    platform: linux/amd64
+    ports:
+      - "8002:8000"
+    volumes:
+      - ./magicdoc/storage/uploads:/app/storage/uploads
+      - ./magicdoc/storage/processed:/app/storage/processed
+    environment:
+      - PYTHONUNBUFFERED=1
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 60s
+    networks:
+      - app-network
+
  # Backend API Service
  backend-api:
    build:
@@ -40,9 +63,11 @@ services:
      - CELERY_BROKER_URL=redis://redis:6379/0
      - CELERY_RESULT_BACKEND=redis://redis:6379/0
      - MINERU_API_URL=http://mineru-api:8000
+      - MAGICDOC_API_URL=http://magicdoc-api:8000
    depends_on:
      - redis
      - mineru-api
+      - magicdoc-api
    networks:
      - app-network

@@ -60,6 +85,7 @@ services:
      - CELERY_BROKER_URL=redis://redis:6379/0
      - CELERY_RESULT_BACKEND=redis://redis:6379/0
      - MINERU_API_URL=http://mineru-api:8000
+      - MAGICDOC_API_URL=http://magicdoc-api:8000
    depends_on:
      - redis
      - backend-api
--- a/magicdoc/Dockerfile
+++ b/magicdoc/Dockerfile
@@ -0,0 +1,38 @@
+FROM python:3.10-slim
+
+WORKDIR /app
+
+# Install system dependencies including LibreOffice
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    libreoffice \
+    libreoffice-writer \
+    libreoffice-calc \
+    libreoffice-impress \
+    wget \
+    curl \
+    && rm -rf /var/lib/apt/lists/*
+
+# Copy requirements and install Python packages first
+COPY requirements.txt .
+RUN pip install --upgrade pip
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Install fairy-doc after numpy and opencv are installed
+RUN pip install --no-cache-dir "fairy-doc[cpu]"
+
+# Copy the application code
+COPY app/ ./app/
+
+# Create storage directories
+RUN mkdir -p storage/uploads storage/processed
+
+# Expose the port the app runs on
+EXPOSE 8000
+
+# Health check
+HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
+    CMD curl -f http://localhost:8000/health || exit 1
+
+# Command to run the application
+CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
--- a/magicdoc/README.md
+++ b/magicdoc/README.md
@@ -0,0 +1,94 @@
+# MagicDoc API Service
+
+A FastAPI service that provides document-to-Markdown conversion using the Magic-Doc library. This service is designed to be compatible with the existing Mineru API interface.
+
+## Features
+
+- Converts DOC, DOCX, PPT, PPTX, and PDF files to markdown
+- RESTful API interface compatible with Mineru API
+- Docker containerization with LibreOffice dependencies
+- Health check endpoint
+- File upload support
+
+## API Endpoints
+
+### Health Check
+```
+GET /health
+```
+Returns service health status.
+
+### File Parse
+```
+POST /file_parse
+```
+Converts uploaded document to markdown.
+
+**Parameters:**
+- `files`: File upload (required)
+- `output_dir`: Output directory (default: "./output")
+- `lang_list`: Language list (default: "ch")
+- `backend`: Backend type (default: "pipeline")
+- `parse_method`: Parse method (default: "auto")
+- `formula_enable`: Enable formula processing (default: true)
+- `table_enable`: Enable table processing (default: true)
+- `return_md`: Return markdown (default: true)
+- `return_middle_json`: Return middle JSON (default: false)
+- `return_model_output`: Return model output (default: false)
+- `return_content_list`: Return content list (default: false)
+- `return_images`: Return images (default: false)
+- `start_page_id`: Start page ID (default: 0)
+- `end_page_id`: End page ID (default: 99999)
+
+**Response:**
+```json
+{
+  "markdown": "converted markdown content",
+  "md": "converted markdown content",
+  "content": "converted markdown content",
+  "text": "converted markdown content",
+  "time_cost": 1.23,
+  "filename": "document.docx",
+  "status": "success"
+}
+```
+
+## Running with Docker
+
+### Build and run with docker-compose
+```bash
+cd magicdoc
+docker-compose up --build
+```
+
+The service will be available at `http://localhost:8002`
+
+### Build and run with Docker
+```bash
+cd magicdoc
+docker build -t magicdoc-api .
+docker run -p 8002:8000 magicdoc-api
+```
+
+## Integration with Document Processors
+
+This service is designed to be compatible with the existing document processors. To use it instead of the Mineru API, update the configuration in your document processors:
+
+```python
+# In docx_processor.py or pdf_processor.py
+self.magicdoc_base_url = getattr(settings, 'MAGICDOC_API_URL', 'http://magicdoc-api:8000')
+```
+
+## Dependencies
+
+- Python 3.10
+- LibreOffice (installed in Docker container)
+- Magic-Doc library
+- FastAPI
+- Uvicorn
+
+## Storage
+
+The service creates the following directories:
+- `storage/uploads/`: For uploaded files
+- `storage/processed/`: For processed files
--- a/magicdoc/SETUP.md
+++ b/magicdoc/SETUP.md
@@ -0,0 +1,152 @@
+# MagicDoc Service Setup Guide
+
+This guide explains how to set up and use the MagicDoc API service as an alternative to the Mineru API for document processing.
+
+## Overview
+
+The MagicDoc service provides a FastAPI-based REST API that converts various document formats (DOC, DOCX, PPT, PPTX, PDF) to markdown using the Magic-Doc library. It's designed to be compatible with your existing document processors.
+
+## Quick Start
+
+### 1. Build and Run the Service
+
+```bash
+cd magicdoc
+./start.sh
+```
+
+Or manually:
+```bash
+cd magicdoc
+docker-compose up --build -d
+```
+
+### 2. Verify the Service
+
+```bash
+# Check health
+curl http://localhost:8002/health
+
+# View API documentation
+open http://localhost:8002/docs
+```
+
+### 3. Test with Sample Files
+
+```bash
+cd magicdoc
+python test_api.py
+```
+
+## API Compatibility
+
+The MagicDoc API is designed to be compatible with your existing Mineru API interface:
+
+### Endpoint: `POST /file_parse`
+
+**Request Format:**
+- File upload via multipart form data
+- Same parameters as Mineru API (most are optional)
+
+**Response Format:**
+```json
+{
+  "markdown": "converted content",
+  "md": "converted content", 
+  "content": "converted content",
+  "text": "converted content",
+  "time_cost": 1.23,
+  "filename": "document.docx",
+  "status": "success"
+}
+```
+
+## Integration with Existing Processors
+
+To use MagicDoc instead of Mineru in your existing processors:
+
+### 1. Update Configuration
+
+Add to your settings:
+```python
+MAGICDOC_API_URL = "http://magicdoc-api:8000"  # or http://localhost:8002
+MAGICDOC_TIMEOUT = 300
+```
+
+### 2. Modify Processors
+
+Replace Mineru API calls with MagicDoc API calls. See `integration_example.py` for detailed examples.
+
+### 3. Update Docker Compose
+
+Add the MagicDoc service to your main docker-compose.yml:
+```yaml
+services:
+  magicdoc-api:
+    build:
+      context: ./magicdoc
+      dockerfile: Dockerfile
+    ports:
+      - "8002:8000"
+    volumes:
+      - ./magicdoc/storage:/app/storage
+    environment:
+      - PYTHONUNBUFFERED=1
+    restart: unless-stopped
+```
+
+## Service Architecture
+
+```
+magicdoc/
+├── app/
+│   ├── __init__.py
+│   └── main.py              # FastAPI application
+├── Dockerfile               # Container definition
+├── docker-compose.yml       # Service orchestration
+├── requirements.txt         # Python dependencies
+├── README.md               # Service documentation
+├── SETUP.md                # This setup guide
+├── test_api.py             # API testing script
+├── integration_example.py  # Integration examples
+└── start.sh                # Startup script
+```
+
+## Dependencies
+
+- **Python 3.10**: Base runtime
+- **LibreOffice**: Document processing (installed in container)
+- **Magic-Doc**: Document conversion library
+- **FastAPI**: Web framework
+- **Uvicorn**: ASGI server
+
+## Troubleshooting
+
+### Service Won't Start
+1. Check Docker is running
+2. Verify port 8002 is available
+3. Check logs: `docker-compose logs`
+
+### File Conversion Fails
+1. Verify LibreOffice is working in container
+2. Check file format is supported
+3. Review API logs for errors
+
+### Integration Issues
+1. Verify API endpoint URL
+2. Check network connectivity between services
+3. Ensure response format compatibility
+
+## Performance Considerations
+
+- MagicDoc is generally faster than Mineru for simple documents
+- The LibreOffice dependency increases the container image size
+- Consider caching for repeated conversions
+- Monitor memory usage for large files
+
+## Security Notes
+
+- Service is attached to the internal Docker network; note that docker-compose also publishes it on host port 8002
+- File uploads are temporary
+- No persistent storage of uploaded files
+- Consider adding authentication for production use
--- a/magicdoc/app/__init__.py
+++ b/magicdoc/app/__init__.py
@ -0,0 +1 @@
+# MagicDoc FastAPI Application
--- a/magicdoc/app/main.py
+++ b/magicdoc/app/main.py
@ -0,0 +1,96 @@
+import os
+import logging
+from typing import Dict, Any, Optional
+from fastapi import FastAPI, File, UploadFile, Form, HTTPException
+from fastapi.responses import JSONResponse
+from magic_doc.docconv import DocConverter, S3Config
+import tempfile
+import shutil
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)  # root logger emits INFO and above
+logger = logging.getLogger(__name__)
+
+app = FastAPI(title="MagicDoc API", version="1.0.0")
+
+# Global converter instance, created once at import time and used by the
+# /file_parse handler; s3_config=None means no S3 configuration is supplied.
+converter = DocConverter(s3_config=None)
+
+@app.get("/health")
+async def health_check():
+    """Health check endpoint"""
+    return {"status": "healthy", "service": "magicdoc-api"}
+
+@app.post("/file_parse")
+async def parse_file(
+    files: UploadFile = File(...),
+    output_dir: str = Form("./output"),
+    lang_list: str = Form("ch"),
+    backend: str = Form("pipeline"),
+    parse_method: str = Form("auto"),
+    formula_enable: bool = Form(True),
+    table_enable: bool = Form(True),
+    return_md: bool = Form(True),
+    return_middle_json: bool = Form(False),
+    return_model_output: bool = Form(False),
+    return_content_list: bool = Form(False),
+    return_images: bool = Form(False),
+    start_page_id: int = Form(0),
+    end_page_id: int = Form(99999)
+):
+    """
+    Parse document file and convert to markdown
+    Compatible with Mineru API interface
+    """
+    try:
+        logger.info(f"Processing file: {files.filename}")
+        
+        # Create temporary file to save uploaded content
+        with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(files.filename)[1]) as temp_file:
+            shutil.copyfileobj(files.file, temp_file)
+            temp_file_path = temp_file.name
+        
+        try:
+            # Convert file to markdown using magic-doc
+            markdown_content, time_cost = converter.convert(temp_file_path, conv_timeout=300)
+            
+            logger.info(f"Successfully converted {files.filename} to markdown in {time_cost:.2f}s")
+            
+            # Return response compatible with Mineru API
+            response = {
+                "markdown": markdown_content,
+                "md": markdown_content,  # Alternative field name
+                "content": markdown_content,  # Alternative field name
+                "text": markdown_content,  # Alternative field name
+                "time_cost": time_cost,
+                "filename": files.filename,
+                "status": "success"
+            }
+            
+            return JSONResponse(content=response)
+            
+        finally:
+            # Clean up temporary file
+            if os.path.exists(temp_file_path):
+                os.unlink(temp_file_path)
+                
+    except Exception as e:
+        logger.error(f"Error processing file {files.filename}: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"Error processing file: {str(e)}")
+
+@app.get("/")
+async def root():
+    """Root endpoint with service information"""
+    return {
+        "service": "MagicDoc API",
+        "version": "1.0.0",
+        "description": "Document to Markdown conversion service using Magic-Doc",
+        "endpoints": {
+            "health": "/health",
+            "file_parse": "/file_parse"
+        }
+    }
+
+# Local entry point: serve the app on all interfaces, port 8000.
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=8000)
--- a/magicdoc/docker-compose.yml
+++ b/magicdoc/docker-compose.yml
@ -0,0 +1,26 @@
+version: '3.8'
+
+services:
+  magicdoc-api:
+    build:
+      context: .
+      dockerfile: Dockerfile
+    platform: linux/amd64
+    ports:
+      # Host port 8002 -> container port 8000
+      - "8002:8000"
+    volumes:
+      # Bind mounts of host directories (paths start with ./), not named volumes
+      - ./storage/uploads:/app/storage/uploads
+      - ./storage/processed:/app/storage/processed
+    environment:
+      - PYTHONUNBUFFERED=1
+    restart: unless-stopped
+    healthcheck:
+      # Probes the /health endpoint; this requires curl to be present in the image
+      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 60s
+
+# NOTE(review): these named volumes are not referenced by the service above,
+# which uses bind mounts instead — confirm whether they are still needed.
+volumes:
+  uploads:
+  processed:
--- a/magicdoc/integration_example.py
+++ b/magicdoc/integration_example.py
@ -0,0 +1,144 @@
+"""
+Example of how to integrate MagicDoc API with existing document processors
+"""
+
+# Example modification for docx_processor.py
+# Replace the Mineru API configuration with MagicDoc API configuration
+
+class DocxDocumentProcessor(DocumentProcessor):
+    def __init__(self, input_path: str, output_path: str):
+        super().__init__()
+        self.input_path = input_path
+        self.output_path = output_path
+        self.output_dir = os.path.dirname(output_path)
+        self.name_without_suff = os.path.splitext(os.path.basename(input_path))[0]
+        
+        # Setup work directory for temporary files
+        self.work_dir = os.path.join(
+            os.path.dirname(output_path), 
+            ".work", 
+            os.path.splitext(os.path.basename(input_path))[0]
+        )
+        os.makedirs(self.work_dir, exist_ok=True)
+        
+        self.ollama_client = OllamaClient(model_name=settings.OLLAMA_MODEL, base_url=settings.OLLAMA_API_URL)
+        
+        # MagicDoc API configuration (instead of Mineru)
+        self.magicdoc_base_url = getattr(settings, 'MAGICDOC_API_URL', 'http://magicdoc-api:8000')
+        self.magicdoc_timeout = getattr(settings, 'MAGICDOC_TIMEOUT', 300)  # 5 minutes timeout
+
+    def _call_magicdoc_api(self, file_path: str) -> Optional[Dict[str, Any]]:
+        """
+        Call MagicDoc API to convert DOCX to markdown
+        
+        Args:
+            file_path: Path to the DOCX file
+            
+        Returns:
+            API response as dictionary or None if failed
+        """
+        try:
+            url = f"{self.magicdoc_base_url}/file_parse"
+            
+            with open(file_path, 'rb') as file:
+                files = {'files': (os.path.basename(file_path), file, 'application/vnd.openxmlformats-officedocument.wordprocessingml.document')}
+                
+                # Prepare form data - simplified compared to Mineru
+                data = {
+                    'output_dir': './output',
+                    'lang_list': 'ch',
+                    'backend': 'pipeline',
+                    'parse_method': 'auto',
+                    'formula_enable': True,
+                    'table_enable': True,
+                    'return_md': True,
+                    'return_middle_json': False,
+                    'return_model_output': False,
+                    'return_content_list': False,
+                    'return_images': False,
+                    'start_page_id': 0,
+                    'end_page_id': 99999
+                }
+                
+                logger.info(f"Calling MagicDoc API for DOCX processing at {url}")
+                response = requests.post(
+                    url, 
+                    files=files,
+                    data=data,
+                    timeout=self.magicdoc_timeout
+                )
+                
+                if response.status_code == 200:
+                    result = response.json()
+                    logger.info("Successfully received response from MagicDoc API for DOCX")
+                    return result
+                else:
+                    error_msg = f"MagicDoc API returned status code {response.status_code}: {response.text}"
+                    logger.error(error_msg)
+                    raise Exception(error_msg)
+                    
+        except requests.exceptions.Timeout:
+            error_msg = f"MagicDoc API request timed out after {self.magicdoc_timeout} seconds"
+            logger.error(error_msg)
+            raise Exception(error_msg)
+        except requests.exceptions.RequestException as e:
+            error_msg = f"Error calling MagicDoc API for DOCX: {str(e)}"
+            logger.error(error_msg)
+            raise Exception(error_msg)
+        except Exception as e:
+            error_msg = f"Unexpected error calling MagicDoc API for DOCX: {str(e)}"
+            logger.error(error_msg)
+            raise Exception(error_msg)
+
+    def read_content(self) -> str:
+        logger.info("Starting DOCX content processing with MagicDoc API")
+        
+        # Call MagicDoc API to convert DOCX to markdown
+        magicdoc_response = self._call_magicdoc_api(self.input_path)
+        
+        # Extract markdown content from the response
+        markdown_content = self._extract_markdown_from_response(magicdoc_response)
+        
+        if not markdown_content:
+            raise Exception("No markdown content found in MagicDoc API response for DOCX")
+        
+        logger.info(f"Successfully extracted {len(markdown_content)} characters of markdown content from DOCX")
+        
+        # Save the raw markdown content to work directory for reference
+        md_output_path = os.path.join(self.work_dir, f"{self.name_without_suff}.md")
+        with open(md_output_path, 'w', encoding='utf-8') as file:
+            file.write(markdown_content)
+        
+        logger.info(f"Saved raw markdown content from DOCX to {md_output_path}")
+        
+        return markdown_content
+
+# Configuration changes needed in settings.py:
+"""
+# Add these settings to your configuration
+MAGICDOC_API_URL = "http://magicdoc-api:8000"  # or http://localhost:8002 for local development
+MAGICDOC_TIMEOUT = 300  # 5 minutes timeout
+"""
+
+# Docker Compose integration:
+"""
+# Add to your main docker-compose.yml
+services:
+  magicdoc-api:
+    build:
+      context: ./magicdoc
+      dockerfile: Dockerfile
+    ports:
+      - "8002:8000"
+    volumes:
+      - ./magicdoc/storage:/app/storage
+    environment:
+      - PYTHONUNBUFFERED=1
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 60s
+"""
--- a/magicdoc/requirements.txt
+++ b/magicdoc/requirements.txt
@ -0,0 +1,7 @@
+fastapi==0.104.1
+uvicorn[standard]==0.24.0
+python-multipart==0.0.6
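+# fairy-doc[cpu]==0.1.0  # NOTE(review): app/main.py imports magic_doc, but no active line here installs it; presumably installed separately (e.g. in the Dockerfile) — confirm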
+pydantic==2.5.0
+numpy==1.24.3
+opencv-python==4.8.1.78
--- a/magicdoc/start.sh
+++ b/magicdoc/start.sh
@ -0,0 +1,34 @@
+#!/bin/bash
+
+# MagicDoc API Service Startup Script
+#
+# Builds and starts the service with docker-compose, then polls the health
+# endpoint until it answers or the wait budget runs out.
+# Exits non-zero when Docker is not running, when docker-compose fails, or
+# when the service never reports healthy.
+
+HEALTH_URL="http://localhost:8002/health"
+MAX_ATTEMPTS=30   # 30 attempts x 2s = up to 60s of waiting
+RETRY_DELAY=2     # seconds between health probes
+
+echo "Starting MagicDoc API Service..."
+
+# Check if Docker is running
+if ! docker info > /dev/null 2>&1; then
+    echo "Error: Docker is not running. Please start Docker first."
+    exit 1
+fi
+
+# Build and start the service
+echo "Building and starting MagicDoc API service..."
+if ! docker-compose up --build -d; then
+    echo "Error: docker-compose failed to build or start the service."
+    exit 1
+fi
+
+# Wait for service to be ready: poll instead of a single fixed sleep, since
+# startup time varies between machines.
+echo "Waiting for service to be ready..."
+healthy=false
+for ((attempt = 1; attempt <= MAX_ATTEMPTS; attempt++)); do
+    if curl -f "$HEALTH_URL" > /dev/null 2>&1; then
+        healthy=true
+        break
+    fi
+    sleep "$RETRY_DELAY"
+done
+
+# Check health
+echo "Checking service health..."
+status=0
+if [ "$healthy" = true ]; then
+    echo "✅ MagicDoc API service is running successfully!"
+    echo "🌐 Service URL: http://localhost:8002"
+    echo "📖 API Documentation: http://localhost:8002/docs"
+    echo "🔍 Health Check: http://localhost:8002/health"
+else
+    echo "❌ Service health check failed. Check logs with: docker-compose logs"
+    status=1
+fi
+
+echo ""
+echo "To stop the service, run: docker-compose down"
+echo "To view logs, run: docker-compose logs -f"
+
+# Propagate the health result so callers and CI can detect a failed start.
+exit "$status"
--- a/magicdoc/test_api.py
+++ b/magicdoc/test_api.py
@ -0,0 +1,92 @@
+#!/usr/bin/env python3
+"""
+Test script for MagicDoc API
+"""
+
+import requests
+import json
+import os
+
+def test_health_check(base_url="http://localhost:8002"):
+    """Test health check endpoint"""
+    try:
+        response = requests.get(f"{base_url}/health")
+        print(f"Health check status: {response.status_code}")
+        print(f"Response: {response.json()}")
+        return response.status_code == 200
+    except Exception as e:
+        print(f"Health check failed: {e}")
+        return False
+
+def test_file_parse(base_url="http://localhost:8002", file_path=None):
+    """Test file parse endpoint"""
+    if not file_path or not os.path.exists(file_path):
+        print(f"File not found: {file_path}")
+        return False
+    
+    try:
+        with open(file_path, 'rb') as f:
+            files = {'files': (os.path.basename(file_path), f, 'application/octet-stream')}
+            data = {
+                'output_dir': './output',
+                'lang_list': 'ch',
+                'backend': 'pipeline',
+                'parse_method': 'auto',
+                'formula_enable': True,
+                'table_enable': True,
+                'return_md': True,
+                'return_middle_json': False,
+                'return_model_output': False,
+                'return_content_list': False,
+                'return_images': False,
+                'start_page_id': 0,
+                'end_page_id': 99999
+            }
+            
+            response = requests.post(f"{base_url}/file_parse", files=files, data=data)
+            print(f"File parse status: {response.status_code}")
+            
+            if response.status_code == 200:
+                result = response.json()
+                print(f"Success! Converted {len(result.get('markdown', ''))} characters")
+                print(f"Time cost: {result.get('time_cost', 'N/A')}s")
+                return True
+            else:
+                print(f"Error: {response.text}")
+                return False
+                
+    except Exception as e:
+        print(f"File parse failed: {e}")
+        return False
+
+def main():
+    """Main test function"""
+    print("Testing MagicDoc API...")
+    
+    # Test health check
+    print("\n1. Testing health check...")
+    if not test_health_check():
+        print("Health check failed. Make sure the service is running.")
+        return
+    
+    # Test file parse (if sample file exists)
+    print("\n2. Testing file parse...")
+    sample_files = [
+        "../sample_doc/20220707_na_decision-2.docx",
+        "../sample_doc/20220707_na_decision-2.pdf",
+        "../sample_doc/short_doc.md"
+    ]
+    
+    for sample_file in sample_files:
+        if os.path.exists(sample_file):
+            print(f"Testing with {sample_file}...")
+            if test_file_parse(file_path=sample_file):
+                print("File parse test passed!")
+                break
+        else:
+            print(f"Sample file not found: {sample_file}")
+    
+    print("\nTest completed!")
+
+# Run the checks only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
Author	SHA1	Message	Date
tigermren	2075218955	feat: 正式fully支持docx	2025-08-18 01:15:40 +08:00
tigermren	afddcf4dd7	fix: 解决magic-doc包的问题	2025-08-18 01:01:58 +08:00
tigermren	0820d7bba2	feat：新增magicdoc	2025-08-18 00:40:39 +08:00
tigermren	a16b69475e	refine: 整理文件	2025-08-17 23:33:56 +08:00