重新启用pdf_processor

2025-07-14 23:49:28 +08:00 · 2025-07-14 23:49:28 +08:00 · d3e1927bc5
parent e8cb7b1a04
commit d3e1927bc5
9 changed files with 527 additions and 109 deletions
--- a/backend/PDF_PROCESSOR_README.md
+++ b/backend/PDF_PROCESSOR_README.md
@ -0,0 +1,176 @@
+# PDF Processor with Mineru API
+
+## Overview
+
+The PDF processor has been rewritten to use Mineru's REST API instead of the magic_pdf library. This provides better separation of concerns and allows for more flexible deployment options.
+
+## Changes Made
+
+### 1. Removed Dependencies
+- Removed all `magic_pdf` imports and dependencies
+- Removed `PyPDF2` direct usage (though kept in requirements for potential other uses)
+
+### 2. New Implementation
+- **REST API Integration**: Uses HTTP requests to call Mineru's API
+- **Configurable Settings**: Mineru API URL and timeout are configurable
+- **Error Handling**: Comprehensive error handling for network issues, timeouts, and API errors
+- **Flexible Response Parsing**: Handles multiple possible response formats from Mineru API
+
+### 3. Configuration
+
+Add the following settings to your environment or `.env` file:
+
+```bash
+# Mineru API Configuration
+MINERU_API_URL=http://mineru-api:8000
+MINERU_TIMEOUT=300
+```
+
+### 4. API Endpoint
+
+The processor expects Mineru to provide a REST API endpoint at `/extract` that accepts PDF files via multipart form data and returns JSON with markdown content.
+
+#### Expected Request Format:
+```
+POST /extract
+Content-Type: multipart/form-data
+
+file: [PDF file]
+```
+
+#### Expected Response Format:
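+The processor looks for the markdown text under the `markdown`, `content`, or `text` key (in that order), first at the top level of the JSON response and then inside a nested `result` object. For example: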
+
+```json
+{
+  "markdown": "# Document Title\n\nContent here..."
+}
+```
+
+OR
+
+```json
+{
+  "content": "# Document Title\n\nContent here..."
+}
+```
+
+OR
+
+```json
+{
+  "result": {
+    "markdown": "# Document Title\n\nContent here..."
+  }
+}
+```
+
+## Usage
+
+### Basic Usage
+
+```python
+from app.core.document_handlers.processors.pdf_processor import PdfDocumentProcessor
+
+# Create processor instance
+processor = PdfDocumentProcessor("input.pdf", "output.md")
+
+# Read and convert PDF to markdown
+content = processor.read_content()
+
+# Process content (apply masking)
+processed_content = processor.process_content(content)
+
+# Save processed content
+processor.save_content(processed_content)
+```
+
+### Through Document Service
+
+```python
+from app.core.services.document_service import DocumentService
+
+service = DocumentService()
+success = service.process_document("input.pdf", "output.md")
+```
+
+## Testing
+
+Run the test script to verify the implementation:
+
+```bash
+cd backend
+python test_pdf_processor.py
+```
+
+Make sure you have:
+1. A sample PDF file in the `sample_doc/` directory
+2. Mineru API service running and accessible
+3. Proper network connectivity between services
+
+## Error Handling
+
+The processor handles various error scenarios:
+
+- **Network Timeouts**: Configurable timeout (default: 5 minutes)
+- **API Errors**: Any non-200 HTTP response is logged with its status code and body, and processing then fails with an exception
+- **Response Parsing**: Multiple fallback strategies for extracting markdown content
+- **File Operations**: Proper error handling for file reading/writing
+
+## Logging
+
+The processor provides detailed logging for debugging:
+
+- API call attempts and responses
+- Content extraction results
+- Error conditions and stack traces
+- Processing statistics
+
+## Deployment
+
+### Docker Compose
+
+Ensure your Mineru service is running and accessible. The default configuration expects it at `http://mineru-api:8000`.
+
+### Environment Variables
+
+Set the following environment variables in your deployment:
+
+```bash
+MINERU_API_URL=http://your-mineru-service:8000
+MINERU_TIMEOUT=300
+```
+
+## Troubleshooting
+
+### Common Issues
+
+1. **Connection Refused**: Check if Mineru service is running and accessible
+2. **Timeout Errors**: Increase `MINERU_TIMEOUT` for large PDF files
+3. **Empty Content**: Check Mineru API response format and logs
+4. **Network Issues**: Verify network connectivity between services
+
+### Debug Mode
+
+Enable debug logging to see detailed API interactions:
+
+```python
+import logging
+logging.getLogger('app.core.document_handlers.processors.pdf_processor').setLevel(logging.DEBUG)
+```
+
+## Migration from magic_pdf
+
+If you were previously using magic_pdf:
+
+1. **No Code Changes Required**: The interface remains the same
+2. **Configuration Update**: Add Mineru API settings
+3. **Service Dependencies**: Ensure Mineru service is running
+4. **Testing**: Run the test script to verify functionality
+
+## Performance Considerations
+
+- **Timeout**: Large PDFs may require longer timeouts
+- **Memory**: The processor loads the entire PDF into memory for API calls
+- **Network**: API calls add network latency to processing time
+- **Caching**: Consider implementing caching for frequently processed documents 
--- a/backend/app/core/config.py
+++ b/backend/app/core/config.py
@ -31,6 +31,10 @@ class Settings(BaseSettings):
    OLLAMA_API_KEY: str = ""
    OLLAMA_MODEL: str = "llama2"

+    # Mineru API settings
+    MINERU_API_URL: str = "http://mineru-api:8000"
+    MINERU_TIMEOUT: int = 300  # 5 minutes timeout
+
    # Logging settings
    LOG_LEVEL: str = "INFO"
    LOG_FORMAT: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
--- a/backend/app/core/document_handlers/document_factory.py
+++ b/backend/app/core/document_handlers/document_factory.py
@ -4,7 +4,7 @@ from .document_processor import DocumentProcessor
 from .processors import (
    TxtDocumentProcessor,
    # DocxDocumentProcessor,
-    # PdfDocumentProcessor,
+    PdfDocumentProcessor,
    MarkdownDocumentProcessor
 )

@ -17,7 +17,7 @@ class DocumentProcessorFactory:
            '.txt': TxtDocumentProcessor,
            # '.docx': DocxDocumentProcessor,
            # '.doc': DocxDocumentProcessor,
-            # '.pdf': PdfDocumentProcessor,
+            '.pdf': PdfDocumentProcessor,
            '.md': MarkdownDocumentProcessor,
            '.markdown': MarkdownDocumentProcessor
        }
--- a/backend/app/core/document_handlers/processors/init.py
+++ b/backend/app/core/document_handlers/processors/init.py
@ -1,7 +1,7 @@
 from .txt_processor import TxtDocumentProcessor
 # from .docx_processor import DocxDocumentProcessor
-# from .pdf_processor import PdfDocumentProcessor
+from .pdf_processor import PdfDocumentProcessor
 from .md_processor import MarkdownDocumentProcessor

 # __all__ = ['TxtDocumentProcessor', 'DocxDocumentProcessor', 'PdfDocumentProcessor', 'MarkdownDocumentProcessor']
-__all__ = ['TxtDocumentProcessor', 'MarkdownDocumentProcessor']
+__all__ = ['TxtDocumentProcessor', 'PdfDocumentProcessor', 'MarkdownDocumentProcessor']
--- a/backend/app/core/document_handlers/processors/pdf_processor.py
+++ b/backend/app/core/document_handlers/processors/pdf_processor.py
@ -0,0 +1,154 @@
+import os
+import requests
+import logging
+from typing import Dict, Any, Optional
+from ...document_handlers.document_processor import DocumentProcessor
+from ...services.ollama_client import OllamaClient
+from ...config import settings
+
+logger = logging.getLogger(__name__)
+
+class PdfDocumentProcessor(DocumentProcessor):
+    """Document processor that converts a PDF to markdown via the Mineru REST API.
+
+    The PDF at ``input_path`` is uploaded to ``<MINERU_API_URL>/extract``. The
+    markdown found in the JSON reply is returned by ``read_content`` and a raw
+    copy is written to ``<output dir>/.work/<input file stem>/``.
+    """
+
+    # Keys that may carry the markdown text, in lookup priority order.
+    _CONTENT_KEYS = ('markdown', 'content', 'text')
+    # Last-resort heuristic: a string value longer than this is taken as content.
+    _FALLBACK_MIN_LENGTH = 100
+
+    def __init__(self, input_path: str, output_path: str):
+        """
+        Store the paths, create the work directory and set up API clients
+
+        Args:
+            input_path: Path to the source PDF file
+            output_path: Path the processed document is written to
+        """
+        super().__init__()  # Call parent class's __init__
+        self.input_path = input_path
+        self.output_path = output_path
+        self.output_dir = os.path.dirname(output_path)
+        self.name_without_suff = os.path.splitext(os.path.basename(input_path))[0]
+
+        # Setup work directory for temporary files
+        self.work_dir = os.path.join(self.output_dir, ".work", self.name_without_suff)
+        os.makedirs(self.work_dir, exist_ok=True)
+
+        self.ollama_client = OllamaClient(model_name=settings.OLLAMA_MODEL, base_url=settings.OLLAMA_API_URL)
+
+        # Mineru API configuration (getattr keeps working with settings that lack these fields)
+        self.mineru_base_url = getattr(settings, 'MINERU_API_URL', 'http://mineru-api:8000')
+        self.mineru_timeout = getattr(settings, 'MINERU_TIMEOUT', 300)  # 5 minutes timeout
+
+    def _call_mineru_api(self, file_path: str) -> Optional[Dict[str, Any]]:
+        """
+        Call Mineru API to convert PDF to markdown
+
+        Args:
+            file_path: Path to the PDF file
+
+        Returns:
+            API response as dictionary or None if failed (every failure is
+            logged; this method does not raise)
+        """
+        try:
+            # Strip a trailing slash so a base URL like "http://host:8000/" does not yield "//extract"
+            url = f"{self.mineru_base_url.rstrip('/')}/extract"
+
+            with open(file_path, 'rb') as file:
+                files = {'file': (os.path.basename(file_path), file, 'application/pdf')}
+
+                logger.info(f"Calling Mineru API at {url}")
+                response = requests.post(
+                    url,
+                    files=files,
+                    timeout=self.mineru_timeout
+                )
+
+            if response.status_code != 200:
+                logger.error(f"Mineru API returned status code {response.status_code}: {response.text}")
+                return None
+
+            result = response.json()
+            logger.info("Successfully received response from Mineru API")
+            return result
+
+        except requests.exceptions.Timeout:
+            logger.error(f"Mineru API request timed out after {self.mineru_timeout} seconds")
+            return None
+        except requests.exceptions.RequestException as e:
+            logger.error(f"Error calling Mineru API: {str(e)}")
+            return None
+        except Exception as e:
+            # Covers unreadable input files and malformed JSON bodies
+            logger.error(f"Unexpected error calling Mineru API: {str(e)}")
+            return None
+
+    def _extract_markdown_from_response(self, response: Dict[str, Any]) -> str:
+        """
+        Extract markdown content from Mineru API response
+
+        Lookup order: the ``markdown``, ``content`` and ``text`` keys at the
+        top level, then the same keys inside a nested ``result`` dictionary,
+        then any top-level string value longer than 100 characters.
+
+        Args:
+            response: Mineru API response dictionary
+
+        Returns:
+            Extracted markdown content as string, or an empty string when
+            nothing usable is found (never None)
+        """
+        try:
+            # A bare JSON string is treated as the content itself
+            if isinstance(response, str):
+                return response
+            if not isinstance(response, dict):
+                logger.warning("Could not find standard markdown field in Mineru response")
+                logger.debug(f"Mineru response structure: {response}")
+                return ""
+
+            # Try different possible response formats: top level first, then nested result
+            containers = [response]
+            nested = response.get('result')
+            if isinstance(nested, dict):
+                containers.append(nested)
+
+            for container in containers:
+                for key in self._CONTENT_KEYS:
+                    value = container.get(key)
+                    # Skip empty or non-string values so a later key can still match
+                    if isinstance(value, str) and value:
+                        return value
+
+            # If no standard format found, try to extract from the response structure
+            logger.warning("Could not find standard markdown field in Mineru response")
+            logger.debug(f"Mineru response structure: {response}")
+
+            # Try to find any text-like content
+            for value in response.values():
+                if isinstance(value, str) and len(value) > self._FALLBACK_MIN_LENGTH:  # Likely content
+                    return value
+
+            return ""
+
+        except Exception as e:
+            logger.error(f"Error extracting markdown from Mineru response: {str(e)}")
+            return ""
+
+    def read_content(self) -> str:
+        """
+        Convert the input PDF to markdown through the Mineru API
+
+        The raw markdown is also written to the work directory for reference.
+
+        Returns:
+            Markdown content extracted from the API response
+
+        Raises:
+            Exception: If the API call fails or the response holds no markdown
+        """
+        logger.info("Starting PDF content processing with Mineru API")
+
+        # Call Mineru API to convert PDF to markdown
+        mineru_response = self._call_mineru_api(self.input_path)
+
+        if not mineru_response:
+            raise Exception("Failed to get response from Mineru API")
+
+        # Extract markdown content from the response
+        markdown_content = self._extract_markdown_from_response(mineru_response)
+
+        if not markdown_content:
+            raise Exception("No markdown content found in Mineru API response")
+
+        logger.info(f"Successfully extracted {len(markdown_content)} characters of markdown content")
+
+        # Save the raw markdown content to work directory for reference
+        md_output_path = os.path.join(self.work_dir, f"{self.name_without_suff}.md")
+        with open(md_output_path, 'w', encoding='utf-8') as file:
+            file.write(markdown_content)
+
+        logger.info(f"Saved raw markdown content to {md_output_path}")
+
+        return markdown_content
+
+    def save_content(self, content: str) -> None:
+        """
+        Write the processed content next to ``output_path`` as a ``.md`` file
+
+        Args:
+            content: Processed (masked) markdown text to write
+        """
+        # Ensure output path has .md extension
+        output_dir = os.path.dirname(self.output_path)
+        base_name = os.path.splitext(os.path.basename(self.output_path))[0]
+        md_output_path = os.path.join(output_dir, f"{base_name}.md")
+
+        logger.info(f"Saving masked content to: {md_output_path}")
+        with open(md_output_path, 'w', encoding='utf-8') as file:
+            file.write(content)
--- a/backend/app/core/document_handlers/processors/pdf_processor.py.backup
+++ b/backend/app/core/document_handlers/processors/pdf_processor.py.backup
@ -1,105 +0,0 @@
-import os
-import PyPDF2
-from ...document_handlers.document_processor import DocumentProcessor
-from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
-from magic_pdf.data.dataset import PymuDocDataset
-from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
-from magic_pdf.config.enums import SupportedPdfParseMethod
-from ...prompts.masking_prompts import get_masking_prompt, get_masking_mapping_prompt
-import logging
-from ...services.ollama_client import OllamaClient
-from ...config import settings
-
-logger = logging.getLogger(__name__)
-
-class PdfDocumentProcessor(DocumentProcessor):
-    def __init__(self, input_path: str, output_path: str):
-        super().__init__()  # Call parent class's __init__
-        self.input_path = input_path
-        self.output_path = output_path
-        self.output_dir = os.path.dirname(output_path)
-        self.name_without_suff = os.path.splitext(os.path.basename(input_path))[0]
-        
-        # Setup output directories
-        self.local_image_dir = os.path.join(self.output_dir, "images")
-        self.image_dir = os.path.basename(self.local_image_dir)
-        os.makedirs(self.local_image_dir, exist_ok=True)
-
-        # Setup work directory under output directory
-        self.work_dir = os.path.join(
-            os.path.dirname(output_path), 
-            ".work", 
-            os.path.splitext(os.path.basename(input_path))[0]
-        )
-        os.makedirs(self.work_dir, exist_ok=True)
-
-        self.work_local_image_dir = os.path.join(self.work_dir, "images")
-        self.work_image_dir = os.path.basename(self.work_local_image_dir)
-        os.makedirs(self.work_local_image_dir, exist_ok=True)   
-        self.ollama_client = OllamaClient(model_name=settings.OLLAMA_MODEL, base_url=settings.OLLAMA_API_URL)
-
-    def read_content(self) -> str:
-        logger.info("Starting PDF content processing")
-        
-        # Read the PDF file
-        with open(self.input_path, 'rb') as file:
-            content = file.read()
-
-        # Initialize writers
-        image_writer = FileBasedDataWriter(self.work_local_image_dir)
-        md_writer = FileBasedDataWriter(self.work_dir)
-
-        # Create Dataset Instance
-        ds = PymuDocDataset(content)
-        
-        logger.info("Classifying PDF type: %s", ds.classify())
-        # Process based on PDF type
-        if ds.classify() == SupportedPdfParseMethod.OCR:
-            infer_result = ds.apply(doc_analyze, ocr=True)
-            pipe_result = infer_result.pipe_ocr_mode(image_writer)
-        else:
-            infer_result = ds.apply(doc_analyze, ocr=False)
-            pipe_result = infer_result.pipe_txt_mode(image_writer)
-        
-        logger.info("Generating all outputs")
-        # Generate all outputs
-        infer_result.draw_model(os.path.join(self.work_dir, f"{self.name_without_suff}_model.pdf"))
-        model_inference_result = infer_result.get_infer_res()
-        
-        pipe_result.draw_layout(os.path.join(self.work_dir, f"{self.name_without_suff}_layout.pdf"))
-        pipe_result.draw_span(os.path.join(self.work_dir, f"{self.name_without_suff}_spans.pdf"))
-        
-        md_content = pipe_result.get_markdown(self.work_image_dir)
-        pipe_result.dump_md(md_writer, f"{self.name_without_suff}.md", self.work_image_dir)
-        
-        content_list = pipe_result.get_content_list(self.work_image_dir)
-        pipe_result.dump_content_list(md_writer, f"{self.name_without_suff}_content_list.json", self.work_image_dir)
-        
-        middle_json = pipe_result.get_middle_json()
-        pipe_result.dump_middle_json(md_writer, f'{self.name_without_suff}_middle.json')
-
-        return md_content
-
-    # def process_content(self, content: str) -> str:
-    #     logger.info("Starting content masking process")
-    #     sentences = content.split("。")
-    #     final_md = ""
-    #     for sentence in sentences:
-    #         if not sentence.strip():  # Skip empty sentences
-    #             continue
-    #         formatted_prompt = get_masking_mapping_prompt(sentence)
-    #         logger.info("Calling ollama to generate response, prompt: %s", formatted_prompt)
-    #         response = self.ollama_client.generate(formatted_prompt)
-    #         logger.info(f"Response generated: {response}")
-    #         final_md += response + "。"
-    #     return final_md
-
-    def save_content(self, content: str) -> None:
-        # Ensure output path has .md extension
-        output_dir = os.path.dirname(self.output_path)
-        base_name = os.path.splitext(os.path.basename(self.output_path))[0]
-        md_output_path = os.path.join(output_dir, f"{base_name}.md")
-        
-        logger.info(f"Saving masked content to: {md_output_path}")
-        with open(md_output_path, 'w', encoding='utf-8') as file:
-            file.write(content)
--- a/backend/log
+++ b/backend/log
@ -0,0 +1,127 @@
+ [2025-07-14 14:20:19,015: INFO/ForkPoolWorker-4] Raw response from LLM: {
+celery_worker-1  |   "entities": []
+celery_worker-1  | }
+celery_worker-1  | [2025-07-14 14:20:19,016: INFO/ForkPoolWorker-4] Parsed mapping: {'entities': []}
+celery_worker-1  | [2025-07-14 14:20:19,020: INFO/ForkPoolWorker-4] Calling ollama to generate case numbers mapping for chunk (attempt 1/3): 
+celery_worker-1  | 你是一个专业的法律文本实体识别助手。请从以下文本中抽取出所有需要脱敏的敏感信息，并按照指定的类别进行分类。请严格按照JSON格式输出结果。
+celery_worker-1  | 
+celery_worker-1  | 实体类别包括:
+celery_worker-1  | - 案号
+celery_worker-1  | 
+celery_worker-1  | 待处理文本:
+celery_worker-1  |   
+celery_worker-1  | 
+celery_worker-1  | 二审案件受理费450892 元，由北京丰复久信营销科技有限公司负担（已交纳）。  
+celery_worker-1  | 
+celery_worker-1  | 29. 本判决为终审判决。  
+celery_worker-1  | 
+celery_worker-1  | 审 判 长 史晓霞审 判 员 邓青菁审 判 员 李 淼二〇二二年七月七日法 官 助 理 黎 铧书 记 员 郑海兴    
+celery_worker-1  | 
+celery_worker-1  | 输出格式:
+celery_worker-1  | {
+celery_worker-1  | "entities": [
+celery_worker-1  |     {"text": "原始文本内容", "type": "案号"},
+celery_worker-1  |     ...
+celery_worker-1  |   ]
+celery_worker-1  | }
+celery_worker-1  | 
+celery_worker-1  | 请严格按照JSON格式输出结果。
+celery_worker-1  | 
+api-1            | INFO:     192.168.65.1:60045 - "GET /api/v1/files/files HTTP/1.1" 200 OK
+api-1            | INFO:     192.168.65.1:34054 - "GET /api/v1/files/files HTTP/1.1" 200 OK
+api-1            | INFO:     192.168.65.1:34054 - "GET /api/v1/files/files HTTP/1.1" 200 OK
+api-1            | INFO:     192.168.65.1:22084 - "GET /api/v1/files/files HTTP/1.1" 200 OK
+celery_worker-1  | [2025-07-14 14:20:31,279: INFO/ForkPoolWorker-4] Raw response from LLM: {
+celery_worker-1  |   "entities": []
+celery_worker-1  | }
+celery_worker-1  | [2025-07-14 14:20:31,281: INFO/ForkPoolWorker-4] Parsed mapping: {'entities': []}
+celery_worker-1  | [2025-07-14 14:20:31,287: INFO/ForkPoolWorker-4] Chunk mapping: [{'entities': []}, {'entities': [{'text': '北京丰复久信营销科技有限公司', 'type': '公司名称'}]}, {'entities': []}, {'entities': []}, {'entities': []}]
+celery_worker-1  | [2025-07-14 14:20:31,288: INFO/ForkPoolWorker-4] Final chunk mappings: [{'entities': [{'text': '郭东军', 'type': '人名'}, {'text': '王欢子', 'type': '人名'}]}, {'entities': [{'text': '北京丰复久信营销科技有限公司', 'type': '公司名称'}, {'text': '丰复久信公司', 'type': '公司名称简称'}, {'text': '中研智创区块链技术有限公司', 'type': '公司名称'}, {'text': '中研智才公司', 'type': '公司名称简称'}]}, {'entities': [{'text': '北京市海淀区北小马厂6 号1 号楼华天大厦1306 室', 'type': '地址'}, {'text': '天津市津南区双港镇工业园区优谷产业园5 号楼-1505', 'type': '地址'}]}, {'entities': [{'text': '服务合同', 'type': '项目名'}]}, {'entities': [{'text': '(2022)京 03 民终 3852 号', 'type': '案号'}, {'text': '（2020）京0105 民初69754 号', 'type': '案号'}]}, {'entities': [{'text': '李圣艳', 'type': '人名'}, {'text': '闫向东', 'type': '人名'}, {'text': '李敏', 'type': '人名'}, {'text': '布兰登·斯密特', 'type': '英文人名'}]}, {'entities': [{'text': '丰复久信公司', 'type': '公司名称'}, {'text': '中研智创公司', 'type': '公司名称'}, {'text': '丰复久信', 'type': '公司名称简称'}, {'text': '中研智创', 'type': '公司名称简称'}]}, {'entities': [{'text': '上海市', 'type': '地址'}, {'text': '北京', 'type': '地址'}]}, {'entities': [{'text': '《计算机设备采购合同》', 'type': '项目名'}]}, {'entities': []}, {'entities': []}, {'entities': [{'text': '丰复久信公司', 'type': '公司名称'}, {'text': '中研智创公司', 'type': '公司名称'}]}, {'entities': []}, {'entities': [{'text': '《服务合同书》', 'type': '项目名'}]}, {'entities': []}, {'entities': []}, {'entities': [{'text': '北京丰复久信营销科技有限公司', 'type': '公司名称'}]}, {'entities': []}, {'entities': []}, {'entities': []}]
+celery_worker-1  | [2025-07-14 14:20:31,288: INFO/ForkPoolWorker-4] Duplicate entity found: {'text': '丰复久信公司', 'type': '公司名称'}
+celery_worker-1  | [2025-07-14 14:20:31,288: INFO/ForkPoolWorker-4] Duplicate entity found: {'text': '丰复久信公司', 'type': '公司名称'}
+celery_worker-1  | [2025-07-14 14:20:31,288: INFO/ForkPoolWorker-4] Duplicate entity found: {'text': '中研智创公司', 'type': '公司名称'}
+celery_worker-1  | [2025-07-14 14:20:31,288: INFO/ForkPoolWorker-4] Duplicate entity found: {'text': '北京丰复久信营销科技有限公司', 'type': '公司名称'}
+celery_worker-1  | [2025-07-14 14:20:31,288: INFO/ForkPoolWorker-4] Merged 22 unique entities
+celery_worker-1  | [2025-07-14 14:20:31,288: INFO/ForkPoolWorker-4] Unique entities: [{'text': '郭东军', 'type': '人名'}, {'text': '王欢子', 'type': '人名'}, {'text': '北京丰复久信营销科技有限公司', 'type': '公司名称'}, {'text': '丰复久信公司', 'type': '公司名称简称'}, {'text': '中研智创区块链技术有限公司', 'type': '公司名称'}, {'text': '中研智才公司', 'type': '公司名称简称'}, {'text': '北京市海淀区北小马厂6 号1 号楼华天大厦1306 室', 'type': '地址'}, {'text': '天津市津南区双港镇工业园区优谷产业园5 号楼-1505', 'type': '地址'}, {'text': '服务合同', 'type': '项目名'}, {'text': '(2022)京 03 民终 3852 号', 'type': '案号'}, {'text': '（2020）京0105 民初69754 号', 'type': '案号'}, {'text': '李圣艳', 'type': '人名'}, {'text': '闫向东', 'type': '人名'}, {'text': '李敏', 'type': '人名'}, {'text': '布兰登·斯密特', 'type': '英文人名'}, {'text': '中研智创公司', 'type': '公司名称'}, {'text': '丰复久信', 'type': '公司名称简称'}, {'text': '中研智创', 'type': '公司名称简称'}, {'text': '上海市', 'type': '地址'}, {'text': '北京', 'type': '地址'}, {'text': '《计算机设备采购合同》', 'type': '项目名'}, {'text': '《服务合同书》', 'type': '项目名'}]
+celery_worker-1  | [2025-07-14 14:20:31,289: INFO/ForkPoolWorker-4] Calling ollama to generate entity linkage (attempt 1/3)
+api-1            | INFO:     192.168.65.1:52168 - "GET /api/v1/files/files HTTP/1.1" 200 OK
+api-1            | INFO:     192.168.65.1:61426 - "GET /api/v1/files/files HTTP/1.1" 200 OK
+api-1            | INFO:     192.168.65.1:30702 - "GET /api/v1/files/files HTTP/1.1" 200 OK
+api-1            | INFO:     192.168.65.1:48159 - "GET /api/v1/files/files HTTP/1.1" 200 OK
+api-1            | INFO:     192.168.65.1:16860 - "GET /api/v1/files/files HTTP/1.1" 200 OK
+api-1            | INFO:     192.168.65.1:21262 - "GET /api/v1/files/files HTTP/1.1" 200 OK
+api-1            | INFO:     192.168.65.1:45564 - "GET /api/v1/files/files HTTP/1.1" 200 OK
+api-1            | INFO:     192.168.65.1:32142 - "GET /api/v1/files/files HTTP/1.1" 200 OK
+api-1            | INFO:     192.168.65.1:27769 - "GET /api/v1/files/files HTTP/1.1" 200 OK
+api-1            | INFO:     192.168.65.1:21196 - "GET /api/v1/files/files HTTP/1.1" 200 OK
+celery_worker-1  | [2025-07-14 14:21:21,436: INFO/ForkPoolWorker-4] Raw entity linkage response from LLM: {
+celery_worker-1  |   "entity_groups": [
+celery_worker-1  |     {
+celery_worker-1  |       "group_id": "group_1",
+celery_worker-1  |       "group_type": "公司名称",
+celery_worker-1  |       "entities": [
+celery_worker-1  |         {
+celery_worker-1  |           "text": "北京丰复久信营销科技有限公司",
+celery_worker-1  |           "type": "公司名称",
+celery_worker-1  |           "is_primary": true
+celery_worker-1  |         },
+celery_worker-1  |         {
+celery_worker-1  |           "text": "丰复久信公司",
+celery_worker-1  |           "type": "公司名称简称",
+celery_worker-1  |           "is_primary": false
+celery_worker-1  |         },
+celery_worker-1  |         {
+celery_worker-1  |           "text": "丰复久信",
+celery_worker-1  |           "type": "公司名称简称",
+celery_worker-1  |           "is_primary": false
+celery_worker-1  |         }
+celery_worker-1  |       ]
+celery_worker-1  |     },
+celery_worker-1  |     {
+celery_worker-1  |       "group_id": "group_2",
+celery_worker-1  |       "group_type": "公司名称",
+celery_worker-1  |       "entities": [
+celery_worker-1  |         {
+celery_worker-1  |           "text": "中研智创区块链技术有限公司",
+celery_worker-1  |           "type": "公司名称",
+celery_worker-1  |           "is_primary": true
+celery_worker-1  |         },
+celery_worker-1  |         {
+celery_worker-1  |           "text": "中研智创公司",
+celery_worker-1  |           "type": "公司名称简称",
+celery_worker-1  |           "is_primary": false
+celery_worker-1  |         },
+celery_worker-1  |         {
+celery_worker-1  |           "text": "中研智创",
+celery_worker-1  |           "type": "公司名称简称",
+celery_worker-1  |           "is_primary": false
+celery_worker-1  |         }
+celery_worker-1  |       ]
+celery_worker-1  |     }
+celery_worker-1  |   ]
+celery_worker-1  | }
+celery_worker-1  | [2025-07-14 14:21:21,437: INFO/ForkPoolWorker-4] Parsed entity linkage: {'entity_groups': [{'group_id': 'group_1', 'group_type': '公司名称', 'entities': [{'text': '北京丰复久信营销科技有限公司', 'type': '公司名称', 'is_primary': True}, {'text': '丰复久信公司', 'type': '公司名称简称', 'is_primary': False}, {'text': '丰复久信', 'type': '公司名称简称', 'is_primary': False}]}, {'group_id': 'group_2', 'group_type': '公司名称', 'entities': [{'text': '中研智创区块链技术有限公司', 'type': '公司名称', 'is_primary': True}, {'text': '中研智创公司', 'type': '公司名称简称', 'is_primary': False}, {'text': '中研智创', 'type': '公司名称简称', 'is_primary': False}]}]}
+celery_worker-1  | [2025-07-14 14:21:21,445: INFO/ForkPoolWorker-4] Successfully created entity linkage with 2 groups
+celery_worker-1  | [2025-07-14 14:21:21,445: INFO/ForkPoolWorker-4] Entity linkage: {'entity_groups': [{'group_id': 'group_1', 'group_type': '公司名称', 'entities': [{'text': '北京丰复久信营销科技有限公司', 'type': '公司名称', 'is_primary': True}, {'text': '丰复久信公司', 'type': '公司名称简称', 'is_primary': False}, {'text': '丰复久信', 'type': '公司名称简称', 'is_primary': False}]}, {'group_id': 'group_2', 'group_type': '公司名称', 'entities': [{'text': '中研智创区块链技术有限公司', 'type': '公司名称', 'is_primary': True}, {'text': '中研智创公司', 'type': '公司名称简称', 'is_primary': False}, {'text': '中研智创', 'type': '公司名称简称', 'is_primary': False}]}]}
+celery_worker-1  | [2025-07-14 14:21:21,446: INFO/ForkPoolWorker-4] Generated masked mapping for 22 entities
+celery_worker-1  | [2025-07-14 14:21:21,446: INFO/ForkPoolWorker-4] Combined mapping: {'郭东军': '某', '王欢子': '某甲', '北京丰复久信营销科技有限公司': '某公司', '丰复久信公司': '某公司甲', '中研智创区块链技术有限公司': '某公司乙', '中研智才公司': '某公司丙', '北京市海淀区北小马厂6 号1 号楼华天大厦1306 室': '某乙', '天津市津南区双港镇工业园区优谷产业园5 号楼-1505': '某丙', '服务合同': '某丁', '(2022)京 03 民终 3852 号': '某戊', '（2020）京0105 民初69754 号': '某己', '李圣艳': '某庚', '闫向东': '某辛', '李敏': '某壬', '布兰登·斯密特': '某癸', '中研智创公司': '某公司丁', '丰复久信': '某公司戊', '中研智创': '某公司己', '上海市': '某11', '北京': '某12', '《计算机设备采购合同》': '某13', '《服务合同书》': '某14'}
+celery_worker-1  | [2025-07-14 14:21:21,446: INFO/ForkPoolWorker-4] Linked entity '北京丰复久信营销科技有限公司' to '北京丰复久信营销科技有限公司' with masked name '某公司'
+celery_worker-1  | [2025-07-14 14:21:21,446: INFO/ForkPoolWorker-4] Linked entity '丰复久信公司' to '北京丰复久信营销科技有限公司' with masked name '某公司'
+celery_worker-1  | [2025-07-14 14:21:21,446: INFO/ForkPoolWorker-4] Linked entity '丰复久信' to '北京丰复久信营销科技有限公司' with masked name '某公司'
+celery_worker-1  | [2025-07-14 14:21:21,446: INFO/ForkPoolWorker-4] Linked entity '中研智创区块链技术有限公司' to '中研智创区块链技术有限公司' with masked name '某公司乙'
+celery_worker-1  | [2025-07-14 14:21:21,446: INFO/ForkPoolWorker-4] Linked entity '中研智创公司' to '中研智创区块链技术有限公司' with masked name '某公司乙'
+celery_worker-1  | [2025-07-14 14:21:21,446: INFO/ForkPoolWorker-4] Linked entity '中研智创' to '中研智创区块链技术有限公司' with masked name '某公司乙'
+celery_worker-1  | [2025-07-14 14:21:21,446: INFO/ForkPoolWorker-4] Final mapping: {'郭东军': '某', '王欢子': '某甲', '北京丰复久信营销科技有限公司': '某公司', '丰复久信公司': '某公司', '中研智创区块链技术有限公司': '某公司乙', '中研智才公司': '某公司丙', '北京市海淀区北小马厂6 号1 号楼华天大厦1306 室': '某乙', '天津市津南区双港镇工业园区优谷产业园5 号楼-1505': '某丙', '服务合同': '某丁', '(2022)京 03 民终 3852 号': '某戊', '（2020）京0105 民初69754 号': '某己', '李圣艳': '某庚', '闫向东': '某辛', '李敏': '某壬', '布兰登·斯密特': '某癸', '中研智创公司': '某公司乙', '丰复久信': '某公司', '中研智创': '某公司乙', '上海市': '某11', '北京': '某12', '《计算机设备采购合同》': '某13', '《服务合同书》': '某14'}
+celery_worker-1  | [2025-07-14 14:21:21,446: INFO/ForkPoolWorker-4] Successfully masked content
+celery_worker-1  | [2025-07-14 14:21:21,449: INFO/ForkPoolWorker-4] Successfully saved masked content to /app/storage/processed/47522ea9-c259-4304-bfe4-1d3ed6902ede.md
+celery_worker-1  | [2025-07-14 14:21:21,470: INFO/ForkPoolWorker-4] Task app.services.file_service.process_file[5cfbca4c-0f6f-4c71-a66b-b22ee2d28139] succeeded in 311.847165101s: None
+api-1            | INFO:     192.168.65.1:33432 - "GET /api/v1/files/files HTTP/1.1" 200 OK
+api-1            | INFO:     192.168.65.1:40073 - "GET /api/v1/files/files HTTP/1.1" 200 OK
+api-1            | INFO:     192.168.65.1:29550 - "GET /api/v1/files/files HTTP/1.1" 200 OK
+api-1            | INFO:     192.168.65.1:61350 - "GET /api/v1/files/files HTTP/1.1" 200 OK
+api-1            | INFO:     192.168.65.1:61755 - "GET /api/v1/files/files HTTP/1.1" 200 OK
+api-1            | INFO:     192.168.65.1:63726 - "GET /api/v1/files/files HTTP/1.1" 200 OK
+api-1            | INFO:     192.168.65.1:43446 - "GET /api/v1/files/files HTTP/1.1" 200 OK
+api-1            | INFO:     192.168.65.1:45624 - "GET /api/v1/files/files HTTP/1.1" 200 OK
+api-1            | INFO:     192.168.65.1:25256 - "GET /api/v1/files/files HTTP/1.1" 200 OK
+api-1            | INFO:     192.168.65.1:43464 - "GET /api/v1/files/files HTTP/1.1" 200 OK
--- a/backend/tests/test.txt
+++ b/backend/tests/test.txt
--- a/backend/tests/test_ner_processor.py
+++ b/backend/tests/test_ner_processor.py
@ -0,0 +1,62 @@
+import pytest
+from app.core.document_handlers.ner_processor import NerProcessor
+
+def test_generate_masked_mapping():
+    """Check the masked value produced for one sample entity of each type.
+
+    Feeds a list of unique entities plus two linkage groups (one company
+    group, one person group) to ``NerProcessor._generate_masked_mapping`` and
+    asserts the expected form of every masked value.
+    """
+
+    def entity(text, kind, **extra):
+        # Build an entity dict; key order is text, type, then any extras
+        return {'text': text, 'type': kind, **extra}
+
+    def group(group_id, group_type, primary, secondary):
+        # Build a linkage group whose first member is the primary entity
+        return {
+            'group_id': group_id,
+            'group_type': group_type,
+            'entities': [
+                entity(primary, group_type, is_primary=True),
+                entity(secondary, group_type, is_primary=False),
+            ],
+        }
+
+    ner = NerProcessor()
+    sample_entities = [
+        entity('李雷', '人名'),
+        entity('李明', '人名'),
+        entity('王强', '人名'),
+        entity('Acme Manufacturing Inc.', '英文公司名', industry='manufacturing'),
+        entity('Google LLC', '英文公司名'),
+        entity('A公司', '公司名称'),
+        entity('B公司', '公司名称'),
+        entity('John Smith', '英文人名'),
+        entity('Elizabeth Windsor', '英文人名'),
+        entity('华梦龙光伏项目', '项目名'),
+        entity('案号12345', '案号'),
+        entity('310101198802080000', '身份证号'),
+        entity('9133021276453538XT', '社会信用代码'),
+    ]
+    entity_linkage = {
+        'entity_groups': [
+            group('g1', '公司名称', 'A公司', 'B公司'),
+            group('g2', '人名', '李雷', '李明'),
+        ]
+    }
+
+    masked = ner._generate_masked_mapping(sample_entities, entity_linkage)
+
+    # Chinese personal names: masked value starts with the expected prefix
+    expected_prefixes = (('李雷', '李某'), ('李明', '李某'), ('王强', '王某'))
+    for name, prefix in expected_prefixes:
+        assert masked[name].startswith(prefix)
+
+    # Company names in the same linkage group share one masked value
+    assert masked['A公司'] == masked['B公司']
+    assert masked['A公司'].endswith('公司')
+
+    # Project names keep their suffix
+    assert masked['华梦龙光伏项目'].endswith('项目')
+
+    # English company names, English personal names, case number,
+    # ID card number and social credit code map to fixed values
+    expected_exact = {
+        'Acme Manufacturing Inc.': 'MANUFACTURING',
+        'Google LLC': 'COMPANY',
+        'John Smith': 'J*** S***',
+        'Elizabeth Windsor': 'E*** W***',
+        '案号12345': '***',
+        '310101198802080000': 'XXXXXX',
+        '9133021276453538XT': 'XXXXXXXX',
+    }
+    for original, expected in expected_exact.items():
+        assert masked[original] == expected