feat: 中文名按照姓+名拼音首字母脱敏
This commit is contained in:
parent
8399bc37fc
commit
2c4ecfd6b0
|
|
@ -8,6 +8,7 @@ from ..utils.json_extractor import LLMJsonExtractor
|
||||||
from ..utils.llm_validator import LLMResponseValidator
|
from ..utils.llm_validator import LLMResponseValidator
|
||||||
import re
|
import re
|
||||||
from .regs.entity_regex import extract_id_number_entities, extract_social_credit_code_entities
|
from .regs.entity_regex import extract_id_number_entities, extract_social_credit_code_entities
|
||||||
|
from pypinyin import pinyin, Style
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
@ -19,6 +20,41 @@ class NerProcessor:
|
||||||
def _validate_mapping_format(self, mapping: Dict[str, Any]) -> bool:
|
def _validate_mapping_format(self, mapping: Dict[str, Any]) -> bool:
|
||||||
return LLMResponseValidator.validate_entity_extraction(mapping)
|
return LLMResponseValidator.validate_entity_extraction(mapping)
|
||||||
|
|
||||||
|
def _mask_chinese_name(self, name: str, surname_counter: Dict[str, Dict[str, int]]) -> str:
|
||||||
|
"""
|
||||||
|
处理中文姓名脱敏:
|
||||||
|
保留姓,名变为大写首字母;
|
||||||
|
同姓名同首字母者按1、2依次编号
|
||||||
|
"""
|
||||||
|
if not name or len(name) < 2:
|
||||||
|
return name
|
||||||
|
|
||||||
|
surname = name[0]
|
||||||
|
given_name = name[1:]
|
||||||
|
|
||||||
|
# 获取名的拼音首字母
|
||||||
|
try:
|
||||||
|
pinyin_list = pinyin(given_name, style=Style.NORMAL)
|
||||||
|
initials = ''.join([p[0][0].upper() for p in pinyin_list if p and p[0]])
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Failed to get pinyin for {given_name}: {e}")
|
||||||
|
# 如果拼音转换失败,使用原字符
|
||||||
|
initials = given_name
|
||||||
|
|
||||||
|
# 初始化姓氏计数器
|
||||||
|
if surname not in surname_counter:
|
||||||
|
surname_counter[surname] = {}
|
||||||
|
|
||||||
|
# 检查是否有相同姓氏和首字母的组合
|
||||||
|
if initials in surname_counter[surname]:
|
||||||
|
surname_counter[surname][initials] += 1
|
||||||
|
masked_name = f"{surname}{initials}{surname_counter[surname][initials]}"
|
||||||
|
else:
|
||||||
|
surname_counter[surname][initials] = 1
|
||||||
|
masked_name = f"{surname}{initials}"
|
||||||
|
|
||||||
|
return masked_name
|
||||||
|
|
||||||
def _process_entity_type(self, chunk: str, prompt_func, entity_type: str) -> Dict[str, str]:
|
def _process_entity_type(self, chunk: str, prompt_func, entity_type: str) -> Dict[str, str]:
|
||||||
for attempt in range(self.max_retries):
|
for attempt in range(self.max_retries):
|
||||||
try:
|
try:
|
||||||
|
|
@ -99,22 +135,23 @@ class NerProcessor:
|
||||||
def _generate_masked_mapping(self, unique_entities: list[Dict[str, str]], linkage: Dict[str, Any]) -> Dict[str, str]:
|
def _generate_masked_mapping(self, unique_entities: list[Dict[str, str]], linkage: Dict[str, Any]) -> Dict[str, str]:
|
||||||
"""
|
"""
|
||||||
结合 linkage 信息,按实体分组映射同一脱敏名,并实现如下规则:
|
结合 linkage 信息,按实体分组映射同一脱敏名,并实现如下规则:
|
||||||
1. 人名/简称:保留姓,名变为某,同姓编号;
|
1. 中文人名:保留姓,名变为大写首字母,同姓名同首字母者按1、2依次编号(如:李强->李Q,张韶涵->张SH,张若宇->张RY,白锦程->白JC);
|
||||||
2. 公司名:同组公司名映射为大写字母公司(A公司、B公司...);
|
2. 律师姓名、审判人员姓名:同上中文人名规则;
|
||||||
3. 英文人名:每个单词首字母+***;
|
3. 公司名:同组公司名映射为大写字母公司(A公司、B公司...);
|
||||||
4. 英文公司名:替换为所属行业名称,英文大写(如无行业信息,默认 COMPANY);
|
4. 英文人名:每个单词首字母+***;
|
||||||
5. 项目名:项目名称变为小写英文字母(如 a项目、b项目...);
|
5. 英文公司名:替换为所属行业名称,英文大写(如无行业信息,默认 COMPANY);
|
||||||
6. 案号:只替换案号中的数字部分为***,保留前后结构和“号”字,支持中间有空格;
|
6. 项目名:项目名称变为小写英文字母(如 a项目、b项目...);
|
||||||
7. 身份证号:6位X;
|
7. 案号:只替换案号中的数字部分为***,保留前后结构和"号"字,支持中间有空格;
|
||||||
8. 社会信用代码:8位X;
|
8. 身份证号:6位X;
|
||||||
9. 地址:保留区级及以上行政区划,去除详细位置;
|
9. 社会信用代码:8位X;
|
||||||
10. 其他类型按原有逻辑。
|
10. 地址:保留区级及以上行政区划,去除详细位置;
|
||||||
|
11. 其他类型按原有逻辑。
|
||||||
"""
|
"""
|
||||||
import re
|
import re
|
||||||
entity_mapping = {}
|
entity_mapping = {}
|
||||||
used_masked_names = set()
|
used_masked_names = set()
|
||||||
group_mask_map = {}
|
group_mask_map = {}
|
||||||
surname_counter = {}
|
surname_counter = {} # 用于中文姓名脱敏的计数器
|
||||||
company_letter = ord('A')
|
company_letter = ord('A')
|
||||||
project_letter = ord('a')
|
project_letter = ord('a')
|
||||||
# 优先区县级单位,后市、省等
|
# 优先区县级单位,后市、省等
|
||||||
|
|
@ -132,18 +169,12 @@ class NerProcessor:
|
||||||
for entity in entities:
|
for entity in entities:
|
||||||
group_mask_map[entity['text']] = masked
|
group_mask_map[entity['text']] = masked
|
||||||
elif '人名' in group_type:
|
elif '人名' in group_type:
|
||||||
surname_local_counter = {}
|
|
||||||
for entity in entities:
|
for entity in entities:
|
||||||
name = entity['text']
|
name = entity['text']
|
||||||
if not name:
|
if not name:
|
||||||
continue
|
continue
|
||||||
surname = name[0]
|
# 使用新的中文姓名脱敏方法
|
||||||
surname_local_counter.setdefault(surname, 0)
|
masked = self._mask_chinese_name(name, surname_counter)
|
||||||
surname_local_counter[surname] += 1
|
|
||||||
if surname_local_counter[surname] == 1:
|
|
||||||
masked = f"{surname}某"
|
|
||||||
else:
|
|
||||||
masked = f"{surname}某{surname_local_counter[surname]}"
|
|
||||||
group_mask_map[name] = masked
|
group_mask_map[name] = masked
|
||||||
elif '英文人名' in group_type:
|
elif '英文人名' in group_type:
|
||||||
for entity in entities:
|
for entity in entities:
|
||||||
|
|
@ -194,13 +225,8 @@ class NerProcessor:
|
||||||
if not name:
|
if not name:
|
||||||
masked = '某'
|
masked = '某'
|
||||||
else:
|
else:
|
||||||
surname = name[0]
|
# 使用新的中文姓名脱敏方法
|
||||||
surname_counter.setdefault(surname, 0)
|
masked = self._mask_chinese_name(name, surname_counter)
|
||||||
surname_counter[surname] += 1
|
|
||||||
if surname_counter[surname] == 1:
|
|
||||||
masked = f"{surname}某"
|
|
||||||
else:
|
|
||||||
masked = f"{surname}某{surname_counter[surname]}"
|
|
||||||
entity_mapping[text] = masked
|
entity_mapping[text] = masked
|
||||||
used_masked_names.add(masked)
|
used_masked_names.add(masked)
|
||||||
elif '公司' in entity_type or 'Company' in entity_type:
|
elif '公司' in entity_type or 'Company' in entity_type:
|
||||||
|
|
|
||||||
|
|
@ -29,4 +29,7 @@ python-docx>=0.8.11
|
||||||
PyPDF2>=3.0.0
|
PyPDF2>=3.0.0
|
||||||
pandas>=2.0.0
|
pandas>=2.0.0
|
||||||
# magic-pdf[full]
|
# magic-pdf[full]
|
||||||
jsonschema>=4.20.0
|
jsonschema>=4.20.0
|
||||||
|
|
||||||
|
# Chinese text processing
|
||||||
|
pypinyin>=0.50.0
|
||||||
|
|
@ -4,9 +4,9 @@ from app.core.document_handlers.ner_processor import NerProcessor
|
||||||
def test_generate_masked_mapping():
|
def test_generate_masked_mapping():
|
||||||
processor = NerProcessor()
|
processor = NerProcessor()
|
||||||
unique_entities = [
|
unique_entities = [
|
||||||
{'text': '李雷', 'type': '人名'},
|
{'text': '李强', 'type': '人名'},
|
||||||
{'text': '李明', 'type': '人名'},
|
{'text': '李强', 'type': '人名'}, # Duplicate to test numbering
|
||||||
{'text': '王强', 'type': '人名'},
|
{'text': '王小明', 'type': '人名'},
|
||||||
{'text': 'Acme Manufacturing Inc.', 'type': '英文公司名', 'industry': 'manufacturing'},
|
{'text': 'Acme Manufacturing Inc.', 'type': '英文公司名', 'industry': 'manufacturing'},
|
||||||
{'text': 'Google LLC', 'type': '英文公司名'},
|
{'text': 'Google LLC', 'type': '英文公司名'},
|
||||||
{'text': 'A公司', 'type': '公司名称'},
|
{'text': 'A公司', 'type': '公司名称'},
|
||||||
|
|
@ -32,17 +32,16 @@ def test_generate_masked_mapping():
|
||||||
'group_id': 'g2',
|
'group_id': 'g2',
|
||||||
'group_type': '人名',
|
'group_type': '人名',
|
||||||
'entities': [
|
'entities': [
|
||||||
{'text': '李雷', 'type': '人名', 'is_primary': True},
|
{'text': '李强', 'type': '人名', 'is_primary': True},
|
||||||
{'text': '李明', 'type': '人名', 'is_primary': False},
|
{'text': '李强', 'type': '人名', 'is_primary': False},
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
mapping = processor._generate_masked_mapping(unique_entities, linkage)
|
mapping = processor._generate_masked_mapping(unique_entities, linkage)
|
||||||
# 人名
|
# 人名 - Updated for new Chinese name masking rules
|
||||||
assert mapping['李雷'].startswith('李某')
|
assert mapping['李强'] == '李Q'
|
||||||
assert mapping['李明'].startswith('李某')
|
assert mapping['王小明'] == '王XM'
|
||||||
assert mapping['王强'].startswith('王某')
|
|
||||||
# 英文公司名
|
# 英文公司名
|
||||||
assert mapping['Acme Manufacturing Inc.'] == 'MANUFACTURING'
|
assert mapping['Acme Manufacturing Inc.'] == 'MANUFACTURING'
|
||||||
assert mapping['Google LLC'] == 'COMPANY'
|
assert mapping['Google LLC'] == 'COMPANY'
|
||||||
|
|
@ -59,4 +58,135 @@ def test_generate_masked_mapping():
|
||||||
# 身份证号
|
# 身份证号
|
||||||
assert mapping['310101198802080000'] == 'XXXXXX'
|
assert mapping['310101198802080000'] == 'XXXXXX'
|
||||||
# 社会信用代码
|
# 社会信用代码
|
||||||
assert mapping['9133021276453538XT'] == 'XXXXXXXX'
|
assert mapping['9133021276453538XT'] == 'XXXXXXXX'
|
||||||
|
|
||||||
|
|
||||||
|
def test_chinese_name_pinyin_masking():
|
||||||
|
"""Test Chinese name masking with pinyin functionality"""
|
||||||
|
processor = NerProcessor()
|
||||||
|
|
||||||
|
# Test basic Chinese name masking
|
||||||
|
test_cases = [
|
||||||
|
("李强", "李Q"),
|
||||||
|
("张韶涵", "张SH"),
|
||||||
|
("张若宇", "张RY"),
|
||||||
|
("白锦程", "白JC"),
|
||||||
|
("王小明", "王XM"),
|
||||||
|
("陈志强", "陈ZQ"),
|
||||||
|
]
|
||||||
|
|
||||||
|
surname_counter = {}
|
||||||
|
|
||||||
|
for original_name, expected_masked in test_cases:
|
||||||
|
masked = processor._mask_chinese_name(original_name, surname_counter)
|
||||||
|
assert masked == expected_masked, f"Expected {expected_masked}, got {masked} for {original_name}"
|
||||||
|
|
||||||
|
# Test duplicate handling
|
||||||
|
duplicate_test_cases = [
|
||||||
|
("李强", "李Q"),
|
||||||
|
("李强", "李Q2"), # Should be numbered
|
||||||
|
("李倩", "李Q3"), # Should be numbered
|
||||||
|
("张韶涵", "张SH"),
|
||||||
|
("张韶涵", "张SH2"), # Should be numbered
|
||||||
|
("张若宇", "张RY"), # Different initials, should not be numbered
|
||||||
|
]
|
||||||
|
|
||||||
|
surname_counter = {} # Reset counter
|
||||||
|
|
||||||
|
for original_name, expected_masked in duplicate_test_cases:
|
||||||
|
masked = processor._mask_chinese_name(original_name, surname_counter)
|
||||||
|
assert masked == expected_masked, f"Expected {expected_masked}, got {masked} for {original_name}"
|
||||||
|
|
||||||
|
# Test edge cases
|
||||||
|
edge_cases = [
|
||||||
|
("", ""), # Empty string
|
||||||
|
("李", "李"), # Single character
|
||||||
|
("李强强", "李QQ"), # Multiple characters with same pinyin
|
||||||
|
]
|
||||||
|
|
||||||
|
surname_counter = {} # Reset counter
|
||||||
|
|
||||||
|
for original_name, expected_masked in edge_cases:
|
||||||
|
masked = processor._mask_chinese_name(original_name, surname_counter)
|
||||||
|
assert masked == expected_masked, f"Expected {expected_masked}, got {masked} for {original_name}"
|
||||||
|
|
||||||
|
|
||||||
|
def test_chinese_name_integration():
|
||||||
|
"""Test Chinese name masking integrated with the full mapping process"""
|
||||||
|
processor = NerProcessor()
|
||||||
|
|
||||||
|
# Test Chinese names in the full mapping context
|
||||||
|
unique_entities = [
|
||||||
|
{'text': '李强', 'type': '人名'},
|
||||||
|
{'text': '张韶涵', 'type': '人名'},
|
||||||
|
{'text': '张若宇', 'type': '人名'},
|
||||||
|
{'text': '白锦程', 'type': '人名'},
|
||||||
|
{'text': '李强', 'type': '人名'}, # Duplicate
|
||||||
|
{'text': '张韶涵', 'type': '人名'}, # Duplicate
|
||||||
|
]
|
||||||
|
|
||||||
|
linkage = {
|
||||||
|
'entity_groups': [
|
||||||
|
{
|
||||||
|
'group_id': 'g1',
|
||||||
|
'group_type': '人名',
|
||||||
|
'entities': [
|
||||||
|
{'text': '李强', 'type': '人名', 'is_primary': True},
|
||||||
|
{'text': '张韶涵', 'type': '人名', 'is_primary': True},
|
||||||
|
{'text': '张若宇', 'type': '人名', 'is_primary': True},
|
||||||
|
{'text': '白锦程', 'type': '人名', 'is_primary': True},
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
|
mapping = processor._generate_masked_mapping(unique_entities, linkage)
|
||||||
|
|
||||||
|
# Verify the mapping results
|
||||||
|
assert mapping['李强'] == '李Q'
|
||||||
|
assert mapping['张韶涵'] == '张SH'
|
||||||
|
assert mapping['张若宇'] == '张RY'
|
||||||
|
assert mapping['白锦程'] == '白JC'
|
||||||
|
|
||||||
|
# Check that duplicates are handled correctly
|
||||||
|
# The second occurrence should be numbered
|
||||||
|
assert '李Q2' in mapping.values() or '张SH2' in mapping.values()
|
||||||
|
|
||||||
|
|
||||||
|
def test_lawyer_and_judge_names():
|
||||||
|
"""Test that lawyer and judge names follow the same Chinese name rules"""
|
||||||
|
processor = NerProcessor()
|
||||||
|
|
||||||
|
# Test lawyer and judge names
|
||||||
|
test_entities = [
|
||||||
|
{'text': '王律师', 'type': '律师姓名'},
|
||||||
|
{'text': '李法官', 'type': '审判人员姓名'},
|
||||||
|
{'text': '张检察官', 'type': '检察官姓名'},
|
||||||
|
]
|
||||||
|
|
||||||
|
linkage = {
|
||||||
|
'entity_groups': [
|
||||||
|
{
|
||||||
|
'group_id': 'g1',
|
||||||
|
'group_type': '律师姓名',
|
||||||
|
'entities': [{'text': '王律师', 'type': '律师姓名', 'is_primary': True}]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
'group_id': 'g2',
|
||||||
|
'group_type': '审判人员姓名',
|
||||||
|
'entities': [{'text': '李法官', 'type': '审判人员姓名', 'is_primary': True}]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
'group_id': 'g3',
|
||||||
|
'group_type': '检察官姓名',
|
||||||
|
'entities': [{'text': '张检察官', 'type': '检察官姓名', 'is_primary': True}]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
|
mapping = processor._generate_masked_mapping(test_entities, linkage)
|
||||||
|
|
||||||
|
# These should follow the same Chinese name masking rules
|
||||||
|
assert mapping['王律师'] == '王L'
|
||||||
|
assert mapping['李法官'] == '李F'
|
||||||
|
assert mapping['张检察官'] == '张JC'
|
||||||
Loading…
Reference in New Issue