legal-doc-masker/backend/tests/test_ner_processor.py

275 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import pytest
from app.core.document_handlers.ner_processor import NerProcessor
def test_generate_masked_mapping():
    """Check the masked value generated for each supported entity type.

    Passes a mixed entity list (one name listed twice) and two linkage
    groups to ``NerProcessor._generate_masked_mapping`` and inspects the
    returned mapping, which is indexed by the original entity text.
    """
    processor = NerProcessor()
    unique_entities = [
        {'text': '李强', 'type': '人名'},
        {'text': '李强', 'type': '人名'},  # Duplicate to test numbering
        {'text': '王小明', 'type': '人名'},
        {'text': 'Acme Manufacturing Inc.', 'type': '英文公司名', 'industry': 'manufacturing'},
        {'text': 'Google LLC', 'type': '英文公司名'},
        {'text': 'A公司', 'type': '公司名称'},
        {'text': 'B公司', 'type': '公司名称'},
        {'text': 'John Smith', 'type': '英文人名'},
        {'text': 'Elizabeth Windsor', 'type': '英文人名'},
        {'text': '华梦龙光伏项目', 'type': '项目名'},
        {'text': '案号12345', 'type': '案号'},
        {'text': '310101198802080000', 'type': '身份证号'},
        {'text': '9133021276453538XT', 'type': '社会信用代码'},
    ]
    linkage = {
        'entity_groups': [
            {
                'group_id': 'g1',
                'group_type': '公司名称',
                'entities': [
                    {'text': 'A公司', 'type': '公司名称', 'is_primary': True},
                    {'text': 'B公司', 'type': '公司名称', 'is_primary': False},
                ],
            },
            {
                'group_id': 'g2',
                'group_type': '人名',
                'entities': [
                    {'text': '李强', 'type': '人名', 'is_primary': True},
                    {'text': '李强', 'type': '人名', 'is_primary': False},
                ],
            },
        ]
    }

    mapping = processor._generate_masked_mapping(unique_entities, linkage)

    # Entities whose masked form is pinned to an exact value.
    expected_exact = {
        # Chinese person names (updated for the new Chinese name masking rules)
        '李强': '李Q',
        '王小明': '王XM',
        # English company names
        'Acme Manufacturing Inc.': 'MANUFACTURING',
        'Google LLC': 'COMPANY',
        # English person names
        'John Smith': 'J*** S***',
        'Elizabeth Windsor': 'E*** W***',
        # Case number
        '案号12345': '***',
        # ID card number
        '310101198802080000': 'XXXXXX',
        # Social credit code
        '9133021276453538XT': 'XXXXXXXX',
    }
    for text, expected in expected_exact.items():
        actual = mapping[text]
        assert actual == expected, f"{text!r}: expected {expected!r}, got {actual!r}"

    # Company names that share a linkage group. The exact masked text is not
    # pinned because it may vary with LLM extraction. The previous check
    # ("'公司' in masked or masked != text") was true for every string, since
    # the input itself contains '公司'; require at least a non-empty string so
    # that this check is able to fail.
    for text in ('A公司', 'B公司'):
        masked = mapping[text]
        assert isinstance(masked, str) and masked, (
            f"{text!r}: expected a non-empty masked string, got {masked!r}"
        )

    # Project name: only the trailing '项目' is checked.
    assert mapping['华梦龙光伏项目'].endswith('项目')
def test_chinese_name_pinyin_masking():
    """Exercise ``NerProcessor._mask_chinese_name`` on three groups of names.

    Each group is run against its own fresh surname counter dict, which is
    passed to every call within that group. The groups cover plain names,
    repeated names (whose expected values carry a numeric suffix) and edge
    cases.
    """
    processor = NerProcessor()

    def run_group(cases):
        # One counter per group: it is shared by all calls in the group and
        # never carried over to the next group.
        counter = {}
        for name, wanted in cases:
            got = processor._mask_chinese_name(name, counter)
            assert got == wanted, f"Expected {wanted}, got {got} for {name}"

    # Basic Chinese name masking
    run_group([
        ("李强", "李Q"),
        ("张韶涵", "张SH"),
        ("张若宇", "张RY"),
        ("白锦程", "白JC"),
        ("王小明", "王XM"),
        ("陈志强", "陈ZQ"),
    ])

    # Duplicate handling
    run_group([
        ("李强", "李Q"),
        ("李强", "李Q2"),  # expected to be numbered
        ("李倩", "李Q3"),  # expected to be numbered
        ("张韶涵", "张SH"),
        ("张韶涵", "张SH2"),  # expected to be numbered
        ("张若宇", "张RY"),  # different initials, expected not to be numbered
    ])

    # Edge cases
    run_group([
        ("", ""),  # empty string
        # NOTE(review): labelled "Single character" in the original, but the
        # literal is empty as shown here - confirm the intended input.
        ("", ""),
        ("李强强", "李QQ"),  # multiple characters with the same pinyin
    ])
def test_chinese_name_integration():
    """Run Chinese person names through the full mapping generation.

    Four names are supplied as entities, two of them listed a second time,
    and all four appear as primary members of a single linkage group. The
    mapping returned by ``NerProcessor._generate_masked_mapping`` is then
    checked per name.
    """
    processor = NerProcessor()
    person_type = '人名'
    primary_names = ['李强', '张韶涵', '张若宇', '白锦程']
    repeated_names = ['李强', '张韶涵']  # listed a second time as duplicates

    unique_entities = [
        {'text': name, 'type': person_type}
        for name in primary_names + repeated_names
    ]
    linkage = {
        'entity_groups': [
            {
                'group_id': 'g1',
                'group_type': person_type,
                'entities': [
                    {'text': name, 'type': person_type, 'is_primary': True}
                    for name in primary_names
                ],
            }
        ]
    }

    mapping = processor._generate_masked_mapping(unique_entities, linkage)

    # Exact masked value expected for each distinct name.
    expected_by_name = {
        '李强': '李Q',
        '张韶涵': '张SH',
        '张若宇': '张RY',
        '白锦程': '白JC',
    }
    for name, expected in expected_by_name.items():
        assert mapping[name] == expected

    # At least one of the repeated names is expected to yield a numbered mask.
    # NOTE(review): if the mapping holds one entry per distinct text, this
    # cannot hold together with the exact-value checks above - confirm how
    # duplicates are represented in the mapping.
    masked_values = mapping.values()
    assert any(numbered in masked_values for numbered in ('李Q2', '张SH2'))
def test_lawyer_and_judge_names():
    """Check that lawyer, judge and prosecutor names are masked like Chinese names.

    Each entity sits alone, as primary, in a linkage group of its own type;
    the expected masks use the same "surname + initials" form as the
    Chinese person-name tests.
    """
    processor = NerProcessor()

    # (group id, entity text, entity type, expected mask)
    roles = [
        ('g1', '王律师', '律师姓名', '王L'),        # lawyer name
        ('g2', '李法官', '审判人员姓名', '李F'),    # judge name
        ('g3', '张检察官', '检察官姓名', '张JC'),   # prosecutor name
    ]

    test_entities = [
        {'text': text, 'type': kind}
        for _group_id, text, kind, _mask in roles
    ]
    linkage = {
        'entity_groups': [
            {
                'group_id': group_id,
                'group_type': kind,
                'entities': [{'text': text, 'type': kind, 'is_primary': True}],
            }
            for group_id, text, kind, _mask in roles
        ]
    }

    mapping = processor._generate_masked_mapping(test_entities, linkage)

    # These should follow the same Chinese name masking rules.
    for _group_id, text, _kind, expected_mask in roles:
        assert mapping[text] == expected_mask
def test_company_name_masking():
    """Print the result of ``NerProcessor._mask_company_name`` for sample companies.

    Nothing is asserted: the results may vary due to LLM extraction, so each
    result is only printed next to its expected value for manual
    verification.
    """
    processor = NerProcessor()

    # (company name, expected masked name)
    samples = (
        ("上海盒马网络科技有限公司", "上海JO网络科技有限公司"),
        ("丰田通商(上海)有限公司", "HVVU上海有限公司"),
        ("雅诗兰黛(上海)商贸有限公司", "AUNF上海商贸有限公司"),
        ("北京百度网讯科技有限公司", "北京BC网讯科技有限公司"),
        ("腾讯科技(深圳)有限公司", "TU科技深圳有限公司"),
        # The business name may not be extracted correctly for this one.
        ("阿里巴巴集团控股有限公司", "阿里巴巴集团控股有限公司"),
    )

    for company, wanted in samples:
        result = processor._mask_company_name(company)
        print(f"{company} -> {result} (expected: {wanted})")
def test_business_name_extraction():
    """Print the result of ``NerProcessor._extract_business_name`` for sample companies.

    Nothing is asserted: the results may vary due to LLM extraction, so each
    extracted name is only printed next to its expected value for manual
    verification.
    """
    processor = NerProcessor()

    # (company name, expected business name)
    samples = (
        ("上海盒马网络科技有限公司", "盒马"),
        ("丰田通商(上海)有限公司", "丰田通商"),
        ("雅诗兰黛(上海)商贸有限公司", "雅诗兰黛"),
        ("北京百度网讯科技有限公司", "百度"),
        ("腾讯科技(深圳)有限公司", "腾讯"),
        ("律师事务所", "律师事务所"),  # edge case
    )

    for company, wanted in samples:
        extracted = processor._extract_business_name(company)
        print(f"Company: {company} -> Business Name: {extracted} (expected: {wanted})")
def test_json_validation_for_business_name():
    """Check ``LLMResponseValidator.validate_business_name_extraction`` verdicts.

    A response carrying a string ``business_name`` and a ``confidence`` must
    be accepted; a response without ``business_name``, or with a non-string
    ``business_name``, must be rejected.
    """
    from app.core.utils.llm_validator import LLMResponseValidator

    # (description, response payload, expected verdict)
    cases = [
        ("valid response", {"business_name": "盒马", "confidence": 0.9}, True),
        ("missing required field", {"confidence": 0.9}, False),
        ("wrong type for business_name", {"business_name": 123, "confidence": 0.9}, False),
    ]

    for description, response, expected in cases:
        verdict = LLMResponseValidator.validate_business_name_extraction(response)
        # Identity comparison instead of `== True` / `== False`: the verdict
        # must be a real boolean, not merely a value equal to one (e.g. 1 or 0).
        assert verdict is expected, f"{description}: expected {expected}, got {verdict!r}"
def test_law_firm_masking():
    """Print the result of ``NerProcessor._mask_company_name`` for law firm names.

    Nothing is asserted: the results may vary due to LLM extraction, so each
    result is only printed next to its expected value for manual
    verification.
    """
    processor = NerProcessor()

    # (law firm name, expected masked name)
    samples = (
        ("北京大成律师事务所", "北京D律师事务所"),
        ("上海锦天城律师事务所", "上海JTC律师事务所"),
        ("广东广信君达律师事务所", "广东GXJD律师事务所"),
    )

    for firm, wanted in samples:
        result = processor._mask_company_name(firm)
        print(f"{firm} -> {result} (expected: {wanted})")