feat: 调整ner的mask规则
This commit is contained in:
parent
1ba4f3cc02
commit
e8cb7b1a04
|
|
@ -96,56 +96,136 @@ class NerProcessor:
|
|||
logger.info(f"Merged {len(unique_entities)} unique entities")
|
||||
return unique_entities
|
||||
|
||||
def _generate_masked_mapping(self, unique_entities: list[Dict[str, str]]) -> Dict[str, str]:
|
||||
def _generate_masked_mapping(self, unique_entities: list[Dict[str, str]], linkage: Dict[str, Any]) -> Dict[str, str]:
|
||||
"""
|
||||
结合 linkage 信息,按实体分组映射同一脱敏名,并实现如下规则:
|
||||
1. 人名/简称:保留姓,名变为某,同姓编号;
|
||||
2. 公司名:同组公司名映射为大写字母公司(A公司、B公司...);
|
||||
3. 英文人名:每个单词首字母+***;
|
||||
4. 英文公司名:替换为所属行业名称,英文大写(如无行业信息,默认 COMPANY);
|
||||
5. 项目名:项目名称变为小写英文字母(如 a项目、b项目...);
|
||||
6. 案号:只替换案号中的数字部分为***,保留前后结构和“号”字,支持中间有空格;
|
||||
7. 身份证号:6位X;
|
||||
8. 社会信用代码:8位X;
|
||||
9. 地址:保留区级及以上行政区划,去除详细位置;
|
||||
10. 其他类型按原有逻辑。
|
||||
"""
|
||||
import re
|
||||
entity_mapping = {}
|
||||
used_masked_names = set()
|
||||
|
||||
group_mask_map = {}
|
||||
surname_counter = {}
|
||||
company_letter = ord('A')
|
||||
project_letter = ord('a')
|
||||
# 优先区县级单位,后市、省等
|
||||
admin_keywords = [
|
||||
'市辖区', '自治县', '自治旗', '林区', '区', '县', '旗', '州', '盟', '地区', '自治州',
|
||||
'市', '省', '自治区', '特别行政区'
|
||||
]
|
||||
admin_pattern = r"^(.*?(?:" + '|'.join(admin_keywords) + r"))"
|
||||
for group in linkage.get('entity_groups', []):
|
||||
group_type = group.get('group_type', '')
|
||||
entities = group.get('entities', [])
|
||||
if '公司' in group_type or 'Company' in group_type:
|
||||
masked = chr(company_letter) + '公司'
|
||||
company_letter += 1
|
||||
for entity in entities:
|
||||
group_mask_map[entity['text']] = masked
|
||||
elif '人名' in group_type:
|
||||
surname_local_counter = {}
|
||||
for entity in entities:
|
||||
name = entity['text']
|
||||
if not name:
|
||||
continue
|
||||
surname = name[0]
|
||||
surname_local_counter.setdefault(surname, 0)
|
||||
surname_local_counter[surname] += 1
|
||||
if surname_local_counter[surname] == 1:
|
||||
masked = f"{surname}某"
|
||||
else:
|
||||
masked = f"{surname}某{surname_local_counter[surname]}"
|
||||
group_mask_map[name] = masked
|
||||
elif '英文人名' in group_type:
|
||||
for entity in entities:
|
||||
name = entity['text']
|
||||
if not name:
|
||||
continue
|
||||
masked = ' '.join([n[0] + '***' if n else '' for n in name.split()])
|
||||
group_mask_map[name] = masked
|
||||
for entity in unique_entities:
|
||||
original_text = entity['text'].strip()
|
||||
text = entity['text']
|
||||
entity_type = entity.get('type', '')
|
||||
|
||||
if '人名' in entity_type or '英文人名' in entity_type:
|
||||
base_name = '某'
|
||||
masked_name = base_name
|
||||
counter = 1
|
||||
|
||||
while masked_name in used_masked_names:
|
||||
if counter <= 10:
|
||||
suffixes = ['甲', '乙', '丙', '丁', '戊', '己', '庚', '辛', '壬', '癸']
|
||||
masked_name = base_name + suffixes[counter - 1]
|
||||
if text in group_mask_map:
|
||||
entity_mapping[text] = group_mask_map[text]
|
||||
used_masked_names.add(group_mask_map[text])
|
||||
elif '英文公司名' in entity_type or 'English Company' in entity_type:
|
||||
industry = entity.get('industry', 'COMPANY')
|
||||
masked = industry.upper()
|
||||
entity_mapping[text] = masked
|
||||
used_masked_names.add(masked)
|
||||
elif '项目名' in entity_type:
|
||||
masked = chr(project_letter) + '项目'
|
||||
project_letter += 1
|
||||
entity_mapping[text] = masked
|
||||
used_masked_names.add(masked)
|
||||
elif '案号' in entity_type:
|
||||
masked = re.sub(r'(\d[\d\s]*)(号)', r'***\2', text)
|
||||
entity_mapping[text] = masked
|
||||
used_masked_names.add(masked)
|
||||
elif '身份证号' in entity_type:
|
||||
masked = 'X' * 6
|
||||
entity_mapping[text] = masked
|
||||
used_masked_names.add(masked)
|
||||
elif '社会信用代码' in entity_type:
|
||||
masked = 'X' * 8
|
||||
entity_mapping[text] = masked
|
||||
used_masked_names.add(masked)
|
||||
elif '地址' in entity_type:
|
||||
# 保留区级及以上行政区划,去除详细位置
|
||||
match = re.match(admin_pattern, text)
|
||||
if match:
|
||||
masked = match.group(1)
|
||||
else:
|
||||
masked = text # fallback
|
||||
entity_mapping[text] = masked
|
||||
used_masked_names.add(masked)
|
||||
elif '人名' in entity_type:
|
||||
name = text
|
||||
if not name:
|
||||
masked = '某'
|
||||
else:
|
||||
surname = name[0]
|
||||
surname_counter.setdefault(surname, 0)
|
||||
surname_counter[surname] += 1
|
||||
if surname_counter[surname] == 1:
|
||||
masked = f"{surname}某"
|
||||
else:
|
||||
masked_name = f"{base_name}{counter}"
|
||||
counter += 1
|
||||
|
||||
masked = f"{surname}某{surname_counter[surname]}"
|
||||
entity_mapping[text] = masked
|
||||
used_masked_names.add(masked)
|
||||
elif '公司' in entity_type or 'Company' in entity_type:
|
||||
base_name = '某公司'
|
||||
masked_name = base_name
|
||||
counter = 1
|
||||
|
||||
while masked_name in used_masked_names:
|
||||
if counter <= 10:
|
||||
suffixes = ['甲', '乙', '丙', '丁', '戊', '己', '庚', '辛', '壬', '癸']
|
||||
masked_name = base_name + suffixes[counter - 1]
|
||||
else:
|
||||
masked_name = f"{base_name}{counter}"
|
||||
counter += 1
|
||||
masked = chr(company_letter) + '公司'
|
||||
company_letter += 1
|
||||
entity_mapping[text] = masked
|
||||
used_masked_names.add(masked)
|
||||
elif '英文人名' in entity_type:
|
||||
name = text
|
||||
masked = ' '.join([n[0] + '***' if n else '' for n in name.split()])
|
||||
entity_mapping[text] = masked
|
||||
used_masked_names.add(masked)
|
||||
else:
|
||||
base_name = '某'
|
||||
masked_name = base_name
|
||||
masked = base_name
|
||||
counter = 1
|
||||
|
||||
while masked_name in used_masked_names:
|
||||
while masked in used_masked_names:
|
||||
if counter <= 10:
|
||||
suffixes = ['甲', '乙', '丙', '丁', '戊', '己', '庚', '辛', '壬', '癸']
|
||||
masked_name = base_name + suffixes[counter - 1]
|
||||
masked = base_name + suffixes[counter - 1]
|
||||
else:
|
||||
masked_name = f"{base_name}{counter}"
|
||||
masked = f"{base_name}{counter}"
|
||||
counter += 1
|
||||
|
||||
entity_mapping[original_text] = masked_name
|
||||
used_masked_names.add(masked_name)
|
||||
|
||||
logger.info(f"Generated masked mapping for {len(entity_mapping)} entities")
|
||||
entity_mapping[text] = masked
|
||||
used_masked_names.add(masked)
|
||||
return entity_mapping
|
||||
|
||||
def _validate_linkage_format(self, linkage: Dict[str, Any]) -> bool:
|
||||
|
|
@ -192,34 +272,10 @@ class NerProcessor:
|
|||
return {"entity_groups": []}
|
||||
|
||||
def _apply_entity_linkage_to_mapping(self, entity_mapping: Dict[str, str], entity_linkage: Dict[str, Any]) -> Dict[str, str]:
|
||||
updated_mapping = entity_mapping.copy()
|
||||
|
||||
for group in entity_linkage.get('entity_groups', []):
|
||||
group_entities = group.get('entities', [])
|
||||
if not group_entities:
|
||||
continue
|
||||
|
||||
primary_entity = None
|
||||
for entity in group_entities:
|
||||
if entity.get('is_primary', False):
|
||||
primary_entity = entity
|
||||
break
|
||||
|
||||
if not primary_entity and group_entities:
|
||||
primary_entity = group_entities[0]
|
||||
|
||||
if primary_entity:
|
||||
primary_text = primary_entity['text']
|
||||
primary_masked = updated_mapping.get(primary_text)
|
||||
|
||||
if primary_masked:
|
||||
for entity in group_entities:
|
||||
entity_text = entity['text']
|
||||
if entity_text in updated_mapping:
|
||||
updated_mapping[entity_text] = primary_masked
|
||||
logger.info(f"Linked entity '{entity_text}' to '{primary_text}' with masked name '{primary_masked}'")
|
||||
|
||||
return updated_mapping
|
||||
"""
|
||||
linkage 已在 _generate_masked_mapping 中处理,此处直接返回 entity_mapping。
|
||||
"""
|
||||
return entity_mapping
|
||||
|
||||
def process(self, chunks: list[str]) -> Dict[str, str]:
|
||||
chunk_mappings = []
|
||||
|
|
@ -237,7 +293,10 @@ class NerProcessor:
|
|||
entity_linkage = self._create_entity_linkage(unique_entities)
|
||||
logger.info(f"Entity linkage: {entity_linkage}")
|
||||
|
||||
combined_mapping = self._generate_masked_mapping(unique_entities)
|
||||
# for quick test
|
||||
# unique_entities = [{'text': '郭东军', 'type': '人名'}, {'text': '王欢子', 'type': '人名'}, {'text': '北京丰复久信营销科技有限公司', 'type': '公司名称'}, {'text': '丰复久信公司', 'type': '公司名称简称'}, {'text': '中研智创区块链技术有限公司', 'type': '公司名称'}, {'text': '中研智才公司', 'type': '公司名称简称'}, {'text': '北京市海淀区北小马厂6 号1 号楼华天大厦1306 室', 'type': '地址'}, {'text': '天津市津南区双港镇工业园区优谷产业园5 号楼-1505', 'type': '地址'}, {'text': '服务合同', 'type': '项目名'}, {'text': '(2022)京 03 民终 3852 号', 'type': '案号'}, {'text': '(2020)京0105 民初69754 号', 'type': '案号'}, {'text': '李圣艳', 'type': '人名'}, {'text': '闫向东', 'type': '人名'}, {'text': '李敏', 'type': '人名'}, {'text': '布兰登·斯密特', 'type': '英文人名'}, {'text': '中研智创公司', 'type': '公司名称'}, {'text': '丰复久信', 'type': '公司名称简称'}, {'text': '中研智创', 'type': '公司名称简称'}, {'text': '上海市', 'type': '地址'}, {'text': '北京', 'type': '地址'}, {'text': '《计算机设备采购合同》', 'type': '项目名'}, {'text': '《服务合同书》', 'type': '项目名'}]
|
||||
# entity_linkage = {'entity_groups': [{'group_id': 'group_1', 'group_type': '公司名称', 'entities': [{'text': '北京丰复久信营销科技有限公司', 'type': '公司名称', 'is_primary': True}, {'text': '丰复久信公司', 'type': '公司名称简称', 'is_primary': False}, {'text': '丰复久信', 'type': '公司名称简称', 'is_primary': False}]}, {'group_id': 'group_2', 'group_type': '公司名称', 'entities': [{'text': '中研智创区块链技术有限公司', 'type': '公司名称', 'is_primary': True}, {'text': '中研智创公司', 'type': '公司名称简称', 'is_primary': False}, {'text': '中研智创', 'type': '公司名称简称', 'is_primary': False}]}]}
|
||||
combined_mapping = self._generate_masked_mapping(unique_entities, entity_linkage)
|
||||
logger.info(f"Combined mapping: {combined_mapping}")
|
||||
|
||||
final_mapping = self._apply_entity_linkage_to_mapping(combined_mapping, entity_linkage)
|
||||
|
|
|
|||
Loading…
Reference in New Issue