feat: Enhance NER processing by adding company name mapping and refactoring prompt functions

2025-06-27 00:39:38 +08:00 · 2025-06-27 00:39:38 +08:00 · 5b1b8f8e9c
parent 5ddef90e8b
commit 5b1b8f8e9c
3 changed files with 80 additions and 135 deletions
--- a/backend/app/core/document_handlers/document_processor.py
+++ b/backend/app/core/document_handlers/document_processor.py
@ -1,6 +1,6 @@
 from abc import ABC, abstractmethod
 from typing import Any, Dict
-from ..prompts.masking_prompts import get_masking_mapping_prompt_v2, get_ner_name_prompt
+from ..prompts.masking_prompts import get_ner_name_prompt, get_ner_company_prompt
 import logging
 import json
 from ..services.ollama_client import OllamaClient
@ -73,8 +73,10 @@ class DocumentProcessor(ABC):
            
        return True

-    def _build_mapping(self, chunk: str) -> Dict[str, str]:
+    def _build_mapping(self, chunk: str) -> list[Dict[str, str]]:
        """Build mapping for a single chunk of text with retry logic"""
+        mapping_pipeline = []
+        # Build people name mapping
        for attempt in range(self.max_retries):
            try:
                formatted_prompt = get_ner_name_prompt(chunk)
@ -87,7 +89,8 @@ class DocumentProcessor(ABC):
                logger.info(f"Parsed mapping: {mapping}")
                
                if mapping and self._validate_mapping_format(mapping):
-                    return mapping
+                    mapping_pipeline.append(mapping)
+                    break
                else:
                    logger.warning(f"Invalid mapping format received on attempt {attempt + 1}, retrying...")
            except Exception as e:
@ -97,7 +100,31 @@ class DocumentProcessor(ABC):
                else:
                    logger.error("Max retries reached, returning empty mapping")
                    return {}
-
+                
+        # Build company name mapping
+        for attempt in range(self.max_retries):
+            try:
+                formatted_prompt = get_ner_company_prompt(chunk)
+                logger.info(f"Calling ollama to generate mapping for chunk (attempt {attempt + 1}/{self.max_retries}): {formatted_prompt}")
+                response = self.ollama_client.generate(formatted_prompt)
+                logger.info(f"Raw response from LLM: {response}")
+                mapping = LLMJsonExtractor.parse_raw_json_str(response)
+                logger.info(f"Parsed mapping: {mapping}")
+                
+                if mapping and self._validate_mapping_format(mapping):
+                    mapping_pipeline.append(mapping)
+                    break
+                else:
+                    logger.warning(f"Invalid mapping format received on attempt {attempt + 1}, retrying...")
+            except Exception as e:
+                logger.error(f"Error generating mapping on attempt {attempt + 1}: {e}")
+                if attempt < self.max_retries - 1:
+                    logger.info("Retrying...")
+                else:
+                    logger.error("Max retries reached, returning empty mapping")
+                    return {}
+        return mapping_pipeline
+    
    def _apply_mapping(self, text: str, mapping: Dict[str, str]) -> str:
        """Apply the mapping to replace sensitive information"""
        masked_text = text
--- a/backend/app/core/document_handlers/processors/txt_processor.py
+++ b/backend/app/core/document_handlers/processors/txt_processor.py
@ -1,7 +1,7 @@
 from ...document_handlers.document_processor import DocumentProcessor
 from ...services.ollama_client import OllamaClient
 import logging
-from ...prompts.masking_prompts import get_masking_prompt
+# from ...prompts.masking_prompts import get_masking_prompt
 from ...config import settings

 logger = logging.getLogger(__name__)
--- a/backend/app/core/prompts/masking_prompts.py
+++ b/backend/app/core/prompts/masking_prompts.py
@ -1,132 +1,5 @@
 import textwrap

-def get_masking_prompt(text: str) -> str:
-    """
-    Returns the prompt for masking sensitive information in legal documents.
-    
-    Args:
-        text (str): The input text to be masked
-        
-    Returns:
-        str: The formatted prompt with the input text
-    """
-    prompt = textwrap.dedent("""
-        您是一位专业的法律文档脱敏专家。请按照以下规则对文本进行脱敏处理：
-
-        规则：
-        1. 人名：
-           - 两字名改为"姓+某"（如：张三 → 张某）
-           - 三字名改为"姓+某某"（如：张三丰 → 张某某）
-        2. 公司名：
-           - 保留地理位置信息（如：北京、上海等）
-           - 保留公司类型（如：有限公司、股份公司等）
-           - 用"某"替换核心名称
-        3. 保持原文其他部分不变
-        4. 确保脱敏后的文本保持原有的语言流畅性和可读性
-
-        输入文本：
-        {text}
-
-        请直接输出脱敏后的文本，无需解释或其他备注。
-    """)
-    
-    return prompt.format(text=text)
-
-def get_masking_mapping_prompt(text: str) -> str:
-    """
-    Returns a prompt that generates a mapping of original names/companies to their masked versions.
-    
-    Args:
-        text (str): The input text to be analyzed for masking
-        
-    Returns:
-        str: The formatted prompt that will generate a mapping dictionary
-    """
-    prompt = textwrap.dedent("""
-        您是一位专业的法律文档脱敏专家。请分析文本并生成一个脱敏映射表，遵循以下规则：
-
-        规则：
-        1. 人名映射规则：
-           - 对于同一姓氏的不同人名，使用字母区分：
-             * 第一个出现的用"姓+某"（如：张三 → 张某）
-             * 第二个出现的用"姓+某A"（如：张四 → 张某A）
-             * 第三个出现的用"姓+某B"（如：张五 → 张某B）
-             依此类推
-           - 三字名同样遵循此规则（如：张三丰 → 张某某，张四海 → 张某某A）
-           
-        2. 公司名映射规则：
-           - 保留地理位置信息（如：北京、上海等）
-           - 保留公司类型（如：有限公司、股份公司等）
-           - 用"某"替换核心名称,但保留首尾字(如：北京智慧科技有限公司 → 北京智某科技有限公司)
-           - 对于多个相似公司名，使用字母区分（如：
-             北京智慧科技有限公司 → 北京某科技有限公司
-             北京智能科技有限公司 → 北京某科技有限公司A）
-        
-        3. 公权机关不做脱敏处理（如：公安局、法院、检察院、中国人民银行、银监会及其他未列明的公权机关）
-
-        请分析以下文本，并生成一个JSON格式的映射表，包含所有需要脱敏的名称及其对应的脱敏后的形式：
-
-        {text}
-
-        请直接输出JSON格式的映射表，格式如下：
-        {{
-            "原文1": "脱敏后1",
-            "原文2": "脱敏后2",
-            ...
-        }}
-        如无需要输出的映射，请输出空json，如下:
-        {{}}
-    """)
-    
-    return prompt.format(text=text)
-
-def get_masking_mapping_prompt_v2(text: str) -> str:
-    """
-    Returns a prompt that generates a mapping of original names/companies to their masked versions.
-    
-    Args:
-        text (str): The input text to be analyzed for masking
-        
-    Returns:
-        str: The formatted prompt that will generate a mapping dictionary
-    """
-    prompt = textwrap.dedent("""
-请根据以下脱敏规则，对给定文本进行分析并生成脱敏映射map（JSON格式）。
-
-脱敏规则说明：
-
-1.人名：保留姓，名变为"某"；同姓者按"某1"、"某2"依次编号；律师姓名、审判人员姓名不脱敏
-2.公司名：替换为大写英文字母（A公司、B公司等）；律师事务所不脱敏
-3.地址：仅保留区级以上地址，删除详细位置
-4.英文人名：保留姓名首字母，其余替换为"***"
-5.英文公司名：替换为所属行业名称的英文大写形式
-6.项目名：替换为小写英文字母（a项目、b项目等）
-7.案号：具体案号部分替换为"** *"
-8.身份证号：替换为6个"X"
-9.统一社会信用代码：替换为8个"X"
-输入文本：
-{text}
-
-输出要求：
-请生成一个JSON格式的映射map，包含以下结构：
-
-{{
-"原文1": "脱敏后1",
-"原文2": "脱敏后2",
- ...  
-}}
-如无需要输出的映射，请输出空json，如下:
-{{}}
-注意事项：
-
-请准确识别文本中的各类敏感信息
-同一类别的多个实体，请在对应类别下全部列出
-如果文本中没有某类敏感信息，可以省略该类别
-请确保脱敏后的文本符合上述规则要求
-
-
-""")
-    return prompt.format(text=text)

 def get_ner_name_prompt(text: str) -> str:
    """
@ -152,12 +25,57 @@ def get_ner_name_prompt(text: str) -> str:
 输出格式:
 {{
 "entities": [
-    {{"text": "原始文本内容", "type": "实体类别"}},
+    {{"text": "原始文本内容", "type": "人名"}},
+    {{"text": "原始文本内容", "type": "英文人名"}},
+    ...
+  ]
+}}
+
+
+
+请严格按照JSON格式输出结果。
+    
+    """)
+    return prompt.format(text=text)
+
+
+def get_ner_company_prompt(text: str) -> str:
+    """
+    Returns a prompt that asks the model to extract company-name entities from the text and classify them by type.
+    
+    Args:
+        text (str): The input text to be analyzed for masking
+        
+    Returns:
+        str: The formatted prompt, which requests a JSON object holding an "entities" list of text/type items
+    """
+    prompt = textwrap.dedent("""
+你是一个专业的法律文本实体识别助手。请从以下文本中抽取出所有需要脱敏的敏感信息，并按照指定的类别进行分类。请严格按照JSON格式输出结果。
+
+实体类别包括:
+- 公司名称
+- 英文公司名称
+- Company with English name
+- 公司名称简称
+- 公司英文名称简称
+
+
+待处理文本:
+{text}  
+
+输出格式:
+{{
+"entities": [
+    {{"text": "原始文本内容", "type": "公司名称"}},
+    {{"text": "原始文本内容", "type": "英文公司名称"}},
+    {{"text": "原始文本内容", "type": "公司名称简称"}},
+    {{"text": "原始文本内容", "type": "公司英文名称简称"}},
    ...
  ]
 }}

 请严格按照JSON格式输出结果。
-    
    """)
-    return prompt.format(text=text)
+    return prompt.format(text=text)
+
+