diff --git a/backend/app/core/document_handlers/ner_processor.py b/backend/app/core/document_handlers/ner_processor.py
index eb9f365..167cd31 100644
--- a/backend/app/core/document_handlers/ner_processor.py
+++ b/backend/app/core/document_handlers/ner_processor.py
@@ -8,6 +8,7 @@ from ..utils.json_extractor import LLMJsonExtractor
 from ..utils.llm_validator import LLMResponseValidator
 import re
 from .regs.entity_regex import extract_id_number_entities, extract_social_credit_code_entities
+from pypinyin import pinyin, Style
 
 logger = logging.getLogger(__name__)
 
@@ -19,6 +20,48 @@ class NerProcessor:
     def _validate_mapping_format(self, mapping: Dict[str, Any]) -> bool:
         return LLMResponseValidator.validate_entity_extraction(mapping)
 
+    def _mask_chinese_name(self, name: str, surname_counter: Dict[str, Dict[str, int]]) -> str:
+        """
+        Mask a Chinese personal name: keep the surname, abbreviate the given name.
+
+        The first character is treated as the surname and kept. Each remaining
+        character is replaced by the upper-case first letter of its pinyin
+        (李强 -> 李Q, 张韶涵 -> 张SH). The first surname + initials combination is
+        returned without a suffix; every later call producing the same
+        combination is numbered from 2 (李Q, 李Q2, 李Q3, ...), including repeat
+        calls with the identical name.
+
+        Args:
+            name: Original name; empty or single-character input is returned
+                unchanged.
+            surname_counter: ``{surname: {initials: count}}`` state shared
+                across calls and updated in place.
+
+        Returns:
+            The masked name.
+        """
+        if not name or len(name) < 2:
+            return name
+
+        surname, given_name = name[0], name[1:]
+
+        try:
+            syllables = pinyin(given_name, style=Style.NORMAL)
+            initials = ''.join(s[0][0].upper() for s in syllables if s and s[0])
+        except Exception as e:
+            # Never log the raw name: this code exists to keep it out of outputs.
+            logger.warning("Failed to derive pinyin initials: %s", type(e).__name__)
+            initials = ''
+        if not initials:
+            # Use a neutral placeholder; falling back to the original given
+            # name would leak exactly the data being masked.
+            initials = '某'
+
+        seen = surname_counter.setdefault(surname, {})
+        seen[initials] = seen.get(initials, 0) + 1
+        count = seen[initials]
+        return f"{surname}{initials}" if count == 1 else f"{surname}{initials}{count}"
+
     def _process_entity_type(self, chunk: str, prompt_func, entity_type: str) -> Dict[str, str]:
         for attempt in range(self.max_retries):
             try:
@@ -99,22 +142,23 @@ class NerProcessor:
     def _generate_masked_mapping(self, unique_entities: list[Dict[str, str]], linkage: Dict[str, Any]) -> Dict[str, str]:
         """
-        结合 linkage 信息,按实体分组映射同一脱敏名,并实现如下规则:
-        1. 人名/简称:保留姓,名变为某,同姓编号;
-        2. 公司名:同组公司名映射为大写字母公司(A公司、B公司...);
-        3. 英文人名:每个单词首字母+***;
-        4. 英文公司名:替换为所属行业名称,英文大写(如无行业信息,默认 COMPANY);
-        5. 项目名:项目名称变为小写英文字母(如 a项目、b项目...);
-        6. 案号:只替换案号中的数字部分为***,保留前后结构和“号”字,支持中间有空格;
-        7. 身份证号:6位X;
-        8. 社会信用代码:8位X;
-        9. 地址:保留区级及以上行政区划,去除详细位置;
-        10. 其他类型按原有逻辑。
+        Combine linkage info so grouped entities map to the same masked name, applying these rules:
+        1. Chinese names: keep the surname, turn the given name into upper-case pinyin initials, number repeats of the same surname + initials from 2 (李强 -> 李Q, 张韶涵 -> 张SH, 张若宇 -> 张RY, 白锦程 -> 白JC);
+        2. Lawyer and judicial-officer names: same rule as Chinese names;
+        3. Company names: companies in one group map to an upper-case-letter company (A公司, B公司, ...);
+        4. English personal names: first letter of each word + ***;
+        5. English company names: replaced by the industry name in upper case (COMPANY when no industry is given);
+        6. Project names: replaced by a lower-case letter (a项目, b项目, ...);
+        7. Case numbers: only the digits become ***; the surrounding structure and the "号" character are kept, inner spaces allowed;
+        8. ID card numbers: six X characters;
+        9. Unified social credit codes: eight X characters;
+        10. Addresses: keep district level and above, drop the detailed location;
+        11. Other types: existing logic.
         """
         import re
         entity_mapping = {}
         used_masked_names = set()
         group_mask_map = {}
-        surname_counter = {}
+        surname_counter = {}  # {surname: {initials: count}} shared by all Chinese-name masking
         company_letter = ord('A')
         project_letter = ord('a')
         # 优先区县级单位,后市、省等
@@ -132,18 +176,15 @@ class NerProcessor:
                 for entity in entities:
                     group_mask_map[entity['text']] = masked
-            elif '人名' in group_type:
-                surname_local_counter = {}
+            elif ('人名' in group_type or '姓名' in group_type) and '英文' not in group_type:
+                # '英文人名' contains '人名', so exclude it here to keep the dedicated
+                # English-name branch below reachable; '姓名' covers lawyer/judge types.
                 for entity in entities:
                     name = entity['text']
-                    if not name:
+                    if not name or name in group_mask_map:
+                        # Skip blanks, and names already masked: re-masking would
+                        # overwrite the stored mask with a numbered variant.
                         continue
-                    surname = name[0]
-                    surname_local_counter.setdefault(surname, 0)
-                    surname_local_counter[surname] += 1
-                    if surname_local_counter[surname] == 1:
-                        masked = f"{surname}某"
-                    else:
-                        masked = f"{surname}某{surname_local_counter[surname]}"
+                    masked = self._mask_chinese_name(name, surname_counter)
                     group_mask_map[name] = masked
             elif '英文人名' in group_type:
                 for entity in entities:
@@ -194,13 +235,8 @@ class NerProcessor:
                 if not name:
                     masked = '某'
                 else:
-                    surname = name[0]
-                    surname_counter.setdefault(surname, 0)
-                    surname_counter[surname] += 1
-                    if surname_counter[surname] == 1:
-                        masked = f"{surname}某"
-                    else:
-                        masked = f"{surname}某{surname_counter[surname]}"
+                    # Surname + pinyin initials, numbered through the shared counter
+                    masked = self._mask_chinese_name(name, surname_counter)
                 entity_mapping[text] = masked
                 used_masked_names.add(masked)
             elif '公司' in entity_type or 'Company' in entity_type:
diff --git a/backend/requirements.txt b/backend/requirements.txt
index 515d6be..3c4e762 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -29,4 +29,7 @@ python-docx>=0.8.11
 PyPDF2>=3.0.0
 pandas>=2.0.0
 # magic-pdf[full]
-jsonschema>=4.20.0
\ No newline at end of file
+jsonschema>=4.20.0
+
+# Chinese text processing
+pypinyin>=0.50.0
diff --git a/backend/tests/test_ner_processor.py b/backend/tests/test_ner_processor.py
index 74cbeb5..e7ccc0b 100644
--- a/backend/tests/test_ner_processor.py
+++ b/backend/tests/test_ner_processor.py
@@ -4,9 +4,9 @@ from app.core.document_handlers.ner_processor import NerProcessor
 def test_generate_masked_mapping():
     processor = NerProcessor()
     unique_entities = [
-        {'text': '李雷', 'type': '人名'},
-        {'text': '李明', 'type': '人名'},
-        {'text': '王强', 'type': '人名'},
+        {'text': '李强', 'type': '人名'},
+        {'text': '李强', 'type': '人名'},  # Duplicate entry: must keep a single mask
+        {'text': '王小明', 'type': '人名'},
         {'text': 'Acme Manufacturing Inc.', 'type': '英文公司名', 'industry': 'manufacturing'},
         {'text': 'Google LLC', 'type': '英文公司名'},
         {'text': 'A公司', 'type': '公司名称'},
@@ -32,17 +32,16 @@ def test_generate_masked_mapping():
                 'group_id': 'g2',
                 'group_type': '人名',
                 'entities': [
-                    {'text': '李雷', 'type': '人名', 'is_primary': True},
-                    {'text': '李明', 'type': '人名', 'is_primary': False},
+                    {'text': '李强', 'type': '人名', 'is_primary': True},
+                    {'text': '李强', 'type': '人名', 'is_primary': False},
                 ]
             }
         ]
     }
     mapping = processor._generate_masked_mapping(unique_entities, linkage)
-    # 人名
-    assert mapping['李雷'].startswith('李某')
-    assert mapping['李明'].startswith('李某')
-    assert mapping['王强'].startswith('王某')
+    # Person names: surname kept, given name reduced to pinyin initials
+    assert mapping['李强'] == '李Q'
+    assert mapping['王小明'] == '王XM'
     # 英文公司名
     assert mapping['Acme Manufacturing Inc.'] == 'MANUFACTURING'
     assert mapping['Google LLC'] == 'COMPANY'
@@ -59,4 +58,135 @@ def test_generate_masked_mapping():
     # 身份证号
     assert mapping['310101198802080000'] == 'XXXXXX'
     # 社会信用代码
-    assert mapping['9133021276453538XT'] == 'XXXXXXXX'
\ No newline at end of file
+    assert mapping['9133021276453538XT'] == 'XXXXXXXX'
+
+
+def test_chinese_name_pinyin_masking():
+    """Test Chinese name masking with pinyin functionality"""
+    processor = NerProcessor()
+
+    # Test basic Chinese name masking
+    test_cases = [
+        ("李强", "李Q"),
+        ("张韶涵", "张SH"),
+        ("张若宇", "张RY"),
+        ("白锦程", "白JC"),
+        ("王小明", "王XM"),
+        ("陈志强", "陈ZQ"),
+    ]
+
+    surname_counter = {}
+
+    for original_name, expected_masked in test_cases:
+        masked = processor._mask_chinese_name(original_name, surname_counter)
+        assert masked == expected_masked, f"Expected {expected_masked}, got {masked} for {original_name}"
+
+    # Test duplicate handling
+    duplicate_test_cases = [
+        ("李强", "李Q"),
+        ("李强", "李Q2"),  # Repeat call with the same name is numbered
+        ("李倩", "李Q3"),  # Different name, same surname + initials: numbered too
+        ("张韶涵", "张SH"),
+        ("张韶涵", "张SH2"),  # Should be numbered
+        ("张若宇", "张RY"),  # Different initials, should not be numbered
+    ]
+
+    surname_counter = {}  # Reset counter
+
+    for original_name, expected_masked in duplicate_test_cases:
+        masked = processor._mask_chinese_name(original_name, surname_counter)
+        assert masked == expected_masked, f"Expected {expected_masked}, got {masked} for {original_name}"
+
+    # Test edge cases
+    edge_cases = [
+        ("", ""),  # Empty string
+        ("李", "李"),  # Single character
+        ("李强强", "李QQ"),  # Multiple characters with same pinyin
+    ]
+
+    surname_counter = {}  # Reset counter
+
+    for original_name, expected_masked in edge_cases:
+        masked = processor._mask_chinese_name(original_name, surname_counter)
+        assert masked == expected_masked, f"Expected {expected_masked}, got {masked} for {original_name}"
+
+
+def test_chinese_name_integration():
+    """Test Chinese name masking integrated with the full mapping process"""
+    processor = NerProcessor()
+
+    # Test Chinese names in the full mapping context
+    unique_entities = [
+        {'text': '李强', 'type': '人名'},
+        {'text': '张韶涵', 'type': '人名'},
+        {'text': '张若宇', 'type': '人名'},
+        {'text': '白锦程', 'type': '人名'},
+        {'text': '李强', 'type': '人名'},  # Duplicate
+        {'text': '张韶涵', 'type': '人名'},  # Duplicate
+    ]
+
+    linkage = {
+        'entity_groups': [
+            {
+                'group_id': 'g1',
+                'group_type': '人名',
+                'entities': [
+                    {'text': '李强', 'type': '人名', 'is_primary': True},
+                    {'text': '张韶涵', 'type': '人名', 'is_primary': True},
+                    {'text': '张若宇', 'type': '人名', 'is_primary': True},
+                    {'text': '白锦程', 'type': '人名', 'is_primary': True},
+                ]
+            }
+        ]
+    }
+
+    mapping = processor._generate_masked_mapping(unique_entities, linkage)
+
+    # Verify the mapping results
+    assert mapping['李强'] == '李Q'
+    assert mapping['张韶涵'] == '张SH'
+    assert mapping['张若宇'] == '张RY'
+    assert mapping['白锦程'] == '白JC'
+
+    # Repeated entries share one dict key, so each name keeps exactly one
+    # mask and no numbered variant may appear.
+    assert not {'李Q2', '张SH2'} & set(mapping.values())
+
+
+def test_lawyer_and_judge_names():
+    """Test that lawyer and judge names follow the same Chinese name rules"""
+    processor = NerProcessor()
+
+    # Test lawyer and judge names
+    test_entities = [
+        {'text': '王律师', 'type': '律师姓名'},
+        {'text': '李法官', 'type': '审判人员姓名'},
+        {'text': '张检察官', 'type': '检察官姓名'},
+    ]
+
+    linkage = {
+        'entity_groups': [
+            {
+                'group_id': 'g1',
+                'group_type': '律师姓名',
+                'entities': [{'text': '王律师', 'type': '律师姓名', 'is_primary': True}]
+            },
+            {
+                'group_id': 'g2',
+                'group_type': '审判人员姓名',
+                'entities': [{'text': '李法官', 'type': '审判人员姓名', 'is_primary': True}]
+            },
+            {
+                'group_id': 'g3',
+                'group_type': '检察官姓名',
+                'entities': [{'text': '张检察官', 'type': '检察官姓名', 'is_primary': True}]
+            }
+        ]
+    }
+
+    mapping = processor._generate_masked_mapping(test_entities, linkage)
+
+    # Every character after the surname contributes one pinyin initial
+    assert mapping['王律师'] == '王LS'
+    assert mapping['李法官'] == '李FG'
+    assert mapping['张检察官'] == '张JCG'