feature-ner-keyword-detect #1
|
|
@ -1,6 +1,6 @@
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from typing import Any, Dict
|
from typing import Any, Dict
|
||||||
from ..prompts.masking_prompts import get_masking_mapping_prompt_v2, get_ner_name_prompt
|
from ..prompts.masking_prompts import get_ner_name_prompt, get_ner_company_prompt
|
||||||
import logging
|
import logging
|
||||||
import json
|
import json
|
||||||
from ..services.ollama_client import OllamaClient
|
from ..services.ollama_client import OllamaClient
|
||||||
|
|
@ -73,8 +73,10 @@ class DocumentProcessor(ABC):
|
||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def _build_mapping(self, chunk: str) -> Dict[str, str]:
|
def _build_mapping(self, chunk: str) -> list[Dict[str, str]]:
|
||||||
"""Build mapping for a single chunk of text with retry logic"""
|
"""Build mapping for a single chunk of text with retry logic"""
|
||||||
|
mapping_pipeline = []
|
||||||
|
# Build people name mapping
|
||||||
for attempt in range(self.max_retries):
|
for attempt in range(self.max_retries):
|
||||||
try:
|
try:
|
||||||
formatted_prompt = get_ner_name_prompt(chunk)
|
formatted_prompt = get_ner_name_prompt(chunk)
|
||||||
|
|
@ -87,7 +89,8 @@ class DocumentProcessor(ABC):
|
||||||
logger.info(f"Parsed mapping: {mapping}")
|
logger.info(f"Parsed mapping: {mapping}")
|
||||||
|
|
||||||
if mapping and self._validate_mapping_format(mapping):
|
if mapping and self._validate_mapping_format(mapping):
|
||||||
return mapping
|
mapping_pipeline.append(mapping)
|
||||||
|
break
|
||||||
else:
|
else:
|
||||||
logger.warning(f"Invalid mapping format received on attempt {attempt + 1}, retrying...")
|
logger.warning(f"Invalid mapping format received on attempt {attempt + 1}, retrying...")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
@ -97,7 +100,31 @@ class DocumentProcessor(ABC):
|
||||||
else:
|
else:
|
||||||
logger.error("Max retries reached, returning empty mapping")
|
logger.error("Max retries reached, returning empty mapping")
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
|
# Build company name mapping
|
||||||
|
for attempt in range(self.max_retries):
|
||||||
|
try:
|
||||||
|
formatted_prompt = get_ner_company_prompt(chunk)
|
||||||
|
logger.info(f"Calling ollama to generate mapping for chunk (attempt {attempt + 1}/{self.max_retries}): {formatted_prompt}")
|
||||||
|
response = self.ollama_client.generate(formatted_prompt)
|
||||||
|
logger.info(f"Raw response from LLM: {response}")
|
||||||
|
mapping = LLMJsonExtractor.parse_raw_json_str(response)
|
||||||
|
logger.info(f"Parsed mapping: {mapping}")
|
||||||
|
|
||||||
|
if mapping and self._validate_mapping_format(mapping):
|
||||||
|
mapping_pipeline.append(mapping)
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
logger.warning(f"Invalid mapping format received on attempt {attempt + 1}, retrying...")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error generating mapping on attempt {attempt + 1}: {e}")
|
||||||
|
if attempt < self.max_retries - 1:
|
||||||
|
logger.info("Retrying...")
|
||||||
|
else:
|
||||||
|
logger.error("Max retries reached, returning empty mapping")
|
||||||
|
return {}
|
||||||
|
return mapping_pipeline
|
||||||
|
|
||||||
def _apply_mapping(self, text: str, mapping: Dict[str, str]) -> str:
|
def _apply_mapping(self, text: str, mapping: Dict[str, str]) -> str:
|
||||||
"""Apply the mapping to replace sensitive information"""
|
"""Apply the mapping to replace sensitive information"""
|
||||||
masked_text = text
|
masked_text = text
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,7 @@
|
||||||
from ...document_handlers.document_processor import DocumentProcessor
|
from ...document_handlers.document_processor import DocumentProcessor
|
||||||
from ...services.ollama_client import OllamaClient
|
from ...services.ollama_client import OllamaClient
|
||||||
import logging
|
import logging
|
||||||
from ...prompts.masking_prompts import get_masking_prompt
|
# from ...prompts.masking_prompts import get_masking_prompt
|
||||||
from ...config import settings
|
from ...config import settings
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
|
||||||
|
|
@ -1,132 +1,5 @@
|
||||||
import textwrap
|
import textwrap
|
||||||
|
|
||||||
def get_masking_prompt(text: str) -> str:
|
|
||||||
"""
|
|
||||||
Returns the prompt for masking sensitive information in legal documents.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
text (str): The input text to be masked
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
str: The formatted prompt with the input text
|
|
||||||
"""
|
|
||||||
prompt = textwrap.dedent("""
|
|
||||||
您是一位专业的法律文档脱敏专家。请按照以下规则对文本进行脱敏处理:
|
|
||||||
|
|
||||||
规则:
|
|
||||||
1. 人名:
|
|
||||||
- 两字名改为"姓+某"(如:张三 → 张某)
|
|
||||||
- 三字名改为"姓+某某"(如:张三丰 → 张某某)
|
|
||||||
2. 公司名:
|
|
||||||
- 保留地理位置信息(如:北京、上海等)
|
|
||||||
- 保留公司类型(如:有限公司、股份公司等)
|
|
||||||
- 用"某"替换核心名称
|
|
||||||
3. 保持原文其他部分不变
|
|
||||||
4. 确保脱敏后的文本保持原有的语言流畅性和可读性
|
|
||||||
|
|
||||||
输入文本:
|
|
||||||
{text}
|
|
||||||
|
|
||||||
请直接输出脱敏后的文本,无需解释或其他备注。
|
|
||||||
""")
|
|
||||||
|
|
||||||
return prompt.format(text=text)
|
|
||||||
|
|
||||||
def get_masking_mapping_prompt(text: str) -> str:
|
|
||||||
"""
|
|
||||||
Returns a prompt that generates a mapping of original names/companies to their masked versions.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
text (str): The input text to be analyzed for masking
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
str: The formatted prompt that will generate a mapping dictionary
|
|
||||||
"""
|
|
||||||
prompt = textwrap.dedent("""
|
|
||||||
您是一位专业的法律文档脱敏专家。请分析文本并生成一个脱敏映射表,遵循以下规则:
|
|
||||||
|
|
||||||
规则:
|
|
||||||
1. 人名映射规则:
|
|
||||||
- 对于同一姓氏的不同人名,使用字母区分:
|
|
||||||
* 第一个出现的用"姓+某"(如:张三 → 张某)
|
|
||||||
* 第二个出现的用"姓+某A"(如:张四 → 张某A)
|
|
||||||
* 第三个出现的用"姓+某B"(如:张五 → 张某B)
|
|
||||||
依此类推
|
|
||||||
- 三字名同样遵循此规则(如:张三丰 → 张某某,张四海 → 张某某A)
|
|
||||||
|
|
||||||
2. 公司名映射规则:
|
|
||||||
- 保留地理位置信息(如:北京、上海等)
|
|
||||||
- 保留公司类型(如:有限公司、股份公司等)
|
|
||||||
- 用"某"替换核心名称,但保留首尾字(如:北京智慧科技有限公司 → 北京智某科技有限公司)
|
|
||||||
- 对于多个相似公司名,使用字母区分(如:
|
|
||||||
北京智慧科技有限公司 → 北京某科技有限公司
|
|
||||||
北京智能科技有限公司 → 北京某科技有限公司A)
|
|
||||||
|
|
||||||
3. 公权机关不做脱敏处理(如:公安局、法院、检察院、中国人民银行、银监会及其他未列明的公权机关)
|
|
||||||
|
|
||||||
请分析以下文本,并生成一个JSON格式的映射表,包含所有需要脱敏的名称及其对应的脱敏后的形式:
|
|
||||||
|
|
||||||
{text}
|
|
||||||
|
|
||||||
请直接输出JSON格式的映射表,格式如下:
|
|
||||||
{{
|
|
||||||
"原文1": "脱敏后1",
|
|
||||||
"原文2": "脱敏后2",
|
|
||||||
...
|
|
||||||
}}
|
|
||||||
如无需要输出的映射,请输出空json,如下:
|
|
||||||
{{}}
|
|
||||||
""")
|
|
||||||
|
|
||||||
return prompt.format(text=text)
|
|
||||||
|
|
||||||
def get_masking_mapping_prompt_v2(text: str) -> str:
|
|
||||||
"""
|
|
||||||
Returns a prompt that generates a mapping of original names/companies to their masked versions.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
text (str): The input text to be analyzed for masking
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
str: The formatted prompt that will generate a mapping dictionary
|
|
||||||
"""
|
|
||||||
prompt = textwrap.dedent("""
|
|
||||||
请根据以下脱敏规则,对给定文本进行分析并生成脱敏映射map(JSON格式)。
|
|
||||||
|
|
||||||
脱敏规则说明:
|
|
||||||
|
|
||||||
1.人名:保留姓,名变为"某";同姓者按"某1"、"某2"依次编号;律师姓名、审判人员姓名不脱敏
|
|
||||||
2.公司名:替换为大写英文字母(A公司、B公司等);律师事务所不脱敏
|
|
||||||
3.地址:仅保留区级以上地址,删除详细位置
|
|
||||||
4.英文人名:保留姓名首字母,其余替换为"***"
|
|
||||||
5.英文公司名:替换为所属行业名称的英文大写形式
|
|
||||||
6.项目名:替换为小写英文字母(a项目、b项目等)
|
|
||||||
7.案号:具体案号部分替换为"** *"
|
|
||||||
8.身份证号:替换为6个"X"
|
|
||||||
9.统一社会信用代码:替换为8个"X"
|
|
||||||
输入文本:
|
|
||||||
{text}
|
|
||||||
|
|
||||||
输出要求:
|
|
||||||
请生成一个JSON格式的映射map,包含以下结构:
|
|
||||||
|
|
||||||
{{
|
|
||||||
"原文1": "脱敏后1",
|
|
||||||
"原文2": "脱敏后2",
|
|
||||||
...
|
|
||||||
}}
|
|
||||||
如无需要输出的映射,请输出空json,如下:
|
|
||||||
{{}}
|
|
||||||
注意事项:
|
|
||||||
|
|
||||||
请准确识别文本中的各类敏感信息
|
|
||||||
同一类别的多个实体,请在对应类别下全部列出
|
|
||||||
如果文本中没有某类敏感信息,可以省略该类别
|
|
||||||
请确保脱敏后的文本符合上述规则要求
|
|
||||||
|
|
||||||
|
|
||||||
""")
|
|
||||||
return prompt.format(text=text)
|
|
||||||
|
|
||||||
def get_ner_name_prompt(text: str) -> str:
|
def get_ner_name_prompt(text: str) -> str:
|
||||||
"""
|
"""
|
||||||
|
|
@ -152,12 +25,57 @@ def get_ner_name_prompt(text: str) -> str:
|
||||||
输出格式:
|
输出格式:
|
||||||
{{
|
{{
|
||||||
"entities": [
|
"entities": [
|
||||||
{{"text": "原始文本内容", "type": "实体类别"}},
|
{{"text": "原始文本内容", "type": "人名"}},
|
||||||
|
{{"text": "原始文本内容", "type": "英文人名"}},
|
||||||
|
...
|
||||||
|
]
|
||||||
|
}}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
请严格按照JSON格式输出结果。
|
||||||
|
|
||||||
|
""")
|
||||||
|
return prompt.format(text=text)
|
||||||
|
|
||||||
|
|
||||||
|
def get_ner_company_prompt(text: str) -> str:
|
||||||
|
"""
|
||||||
|
Returns a prompt that generates a mapping of original companies to their masked versions.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
text (str): The input text to be analyzed for masking
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
str: The formatted prompt that will generate a mapping dictionary
|
||||||
|
"""
|
||||||
|
prompt = textwrap.dedent("""
|
||||||
|
你是一个专业的法律文本实体识别助手。请从以下文本中抽取出所有需要脱敏的敏感信息,并按照指定的类别进行分类。请严格按照JSON格式输出结果。
|
||||||
|
|
||||||
|
实体类别包括:
|
||||||
|
- 公司名称
|
||||||
|
- 英文公司名称
|
||||||
|
- Company with English name
|
||||||
|
- 公司名称简称
|
||||||
|
- 公司英文名称简称
|
||||||
|
|
||||||
|
|
||||||
|
待处理文本:
|
||||||
|
{text}
|
||||||
|
|
||||||
|
输出格式:
|
||||||
|
{{
|
||||||
|
"entities": [
|
||||||
|
{{"text": "原始文本内容", "type": "公司名称"}},
|
||||||
|
{{"text": "原始文本内容", "type": "英文公司名称"}},
|
||||||
|
{{"text": "原始文本内容", "type": "公司名称简称"}},
|
||||||
|
{{"text": "原始文本内容", "type": "公司英文名称简称"}},
|
||||||
...
|
...
|
||||||
]
|
]
|
||||||
}}
|
}}
|
||||||
|
|
||||||
请严格按照JSON格式输出结果。
|
请严格按照JSON格式输出结果。
|
||||||
|
|
||||||
""")
|
""")
|
||||||
return prompt.format(text=text)
|
return prompt.format(text=text)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue