275 lines
11 KiB
Python
275 lines
11 KiB
Python
import pytest
|
||
from app.core.document_handlers.ner_processor import NerProcessor
|
||
|
||
def test_generate_masked_mapping():
    """Check the masked value ``_generate_masked_mapping`` yields per entity type.

    Feeds a mixed list of entities (Chinese and English personal names,
    Chinese and English company names, a project name, a case number, an ID
    number and a social credit code) together with two linkage groups, then
    checks the mapping entry for every distinct entity text.
    """
    def entity(text, kind, **extra):
        # Key order matches a hand-written literal: text, type, then extras.
        return {'text': text, 'type': kind, **extra}

    entities = [
        entity('李强', '人名'),
        entity('李强', '人名'),  # listed twice on purpose (duplicate input)
        entity('王小明', '人名'),
        entity('Acme Manufacturing Inc.', '英文公司名', industry='manufacturing'),
        entity('Google LLC', '英文公司名'),
        entity('A公司', '公司名称'),
        entity('B公司', '公司名称'),
        entity('John Smith', '英文人名'),
        entity('Elizabeth Windsor', '英文人名'),
        entity('华梦龙光伏项目', '项目名'),
        entity('案号12345', '案号'),
        entity('310101198802080000', '身份证号'),
        entity('9133021276453538XT', '社会信用代码'),
    ]
    groups = [
        {
            'group_id': 'g1',
            'group_type': '公司名称',
            'entities': [
                entity('A公司', '公司名称', is_primary=True),
                entity('B公司', '公司名称', is_primary=False),
            ],
        },
        {
            'group_id': 'g2',
            'group_type': '人名',
            'entities': [
                entity('李强', '人名', is_primary=True),
                entity('李强', '人名', is_primary=False),
            ],
        },
    ]

    mapping = NerProcessor()._generate_masked_mapping(entities, {'entity_groups': groups})

    # Chinese personal names first, then English company names.
    exact_head = (
        ('李强', '李Q'),
        ('王小明', '王XM'),
        ('Acme Manufacturing Inc.', 'MANUFACTURING'),
        ('Google LLC', 'COMPANY'),
    )
    for text, masked in exact_head:
        assert mapping[text] == masked

    # Chinese company names that share a linkage group. The original author
    # notes that exact results may vary due to LLM extraction.
    # NOTE(review): a value equal to the original text still contains '公司',
    # so this condition holds for any string value; in effect it only checks
    # that the key is present in the mapping.
    for company in ('A公司', 'B公司'):
        assert '公司' in mapping[company] or mapping[company] != company

    # English personal names.
    for text, masked in (('John Smith', 'J*** S***'), ('Elizabeth Windsor', 'E*** W***')):
        assert mapping[text] == masked

    # Project name: only the trailing '项目' is checked.
    assert mapping['华梦龙光伏项目'].endswith('项目')

    # Case number, ID number, social credit code.
    exact_tail = (
        ('案号12345', '***'),
        ('310101198802080000', 'XXXXXX'),
        ('9133021276453538XT', 'XXXXXXXX'),
    )
    for text, masked in exact_tail:
        assert mapping[text] == masked
|
||
|
||
|
||
def test_chinese_name_pinyin_masking():
    """Exercise ``_mask_chinese_name`` on plain, repeated and edge-case names.

    Three batches are run, each against its own fresh ``surname_counter``
    dict: basic names, names whose masked form repeats (the expected values
    carry a numeric suffix from the second occurrence on), and edge cases
    (empty string, single character, repeated given-name character).
    """
    processor = NerProcessor()

    def run_batch(cases):
        # Every batch starts from an empty counter so numbering does not
        # leak from one batch into the next.
        surname_counter = {}
        for original_name, expected_masked in cases:
            masked = processor._mask_chinese_name(original_name, surname_counter)
            assert masked == expected_masked, f"Expected {expected_masked}, got {masked} for {original_name}"

    # Basic names: surname kept, remaining characters become capital letters.
    run_batch([
        ("李强", "李Q"),
        ("张韶涵", "张SH"),
        ("张若宇", "张RY"),
        ("白锦程", "白JC"),
        ("王小明", "王XM"),
        ("陈志强", "陈ZQ"),
    ])

    # Repeated masked forms are expected to be numbered.
    run_batch([
        ("李强", "李Q"),
        ("李强", "李Q2"),  # second occurrence gets a suffix
        ("李倩", "李Q3"),  # different name, same masked form: numbered too
        ("张韶涵", "张SH"),
        ("张韶涵", "张SH2"),  # second occurrence gets a suffix
        ("张若宇", "张RY"),  # different initials, so no suffix expected
    ])

    # Edge cases.
    run_batch([
        ("", ""),  # empty string
        ("李", "李"),  # single character
        ("李强强", "李QQ"),  # given name repeats the same character
    ])
|
||
|
||
|
||
def test_chinese_name_integration():
    """Run Chinese personal names through ``_generate_masked_mapping`` end to end.

    Four distinct names are supplied, two of them a second time, along with
    a single linkage group that marks all four as primary.
    """
    expected = {
        '李强': '李Q',
        '张韶涵': '张SH',
        '张若宇': '张RY',
        '白锦程': '白JC',
    }
    names = list(expected)

    # The four names in order, followed by two repeats.
    unique_entities = [
        {'text': name, 'type': '人名'}
        for name in names + ['李强', '张韶涵']
    ]
    linkage = {
        'entity_groups': [
            {
                'group_id': 'g1',
                'group_type': '人名',
                'entities': [
                    {'text': name, 'type': '人名', 'is_primary': True}
                    for name in names
                ],
            }
        ]
    }

    mapping = NerProcessor()._generate_masked_mapping(unique_entities, linkage)

    # Each name must map to its expected masked form.
    for name in names:
        assert mapping[name] == expected[name]

    # At least one of the repeated names is expected to surface with a
    # numbered masked value.
    assert any(numbered in mapping.values() for numbered in ('李Q2', '张SH2'))
|
||
|
||
|
||
def test_lawyer_and_judge_names():
    """Lawyer, judge and prosecutor entities should be masked like Chinese names.

    Each entity sits alone in its own linkage group, flagged as primary.
    """
    # (entity text, entity type, expected masked value)
    cases = [
        ('王律师', '律师姓名', '王L'),
        ('李法官', '审判人员姓名', '李F'),
        ('张检察官', '检察官姓名', '张JC'),
    ]

    test_entities = [{'text': text, 'type': role} for text, role, _ in cases]
    groups = [
        {
            'group_id': f'g{index}',  # g1, g2, g3
            'group_type': role,
            'entities': [{'text': text, 'type': role, 'is_primary': True}],
        }
        for index, (text, role, _) in enumerate(cases, start=1)
    ]

    mapping = NerProcessor()._generate_masked_mapping(test_entities, {'entity_groups': groups})

    # Expected values: surname character followed by capital letters.
    for text, _, masked in cases:
        assert mapping[text] == masked
|
||
|
||
|
||
def test_company_name_masking():
    """Print what ``_mask_company_name`` returns for several company names.

    NOTE(review): this test contains no assertions. The expected values are
    only printed next to the actual ones, because (per the original author)
    exact results may vary due to LLM extraction. It can only fail if the
    call itself raises.
    """
    processor = NerProcessor()

    # original company name -> masked form the author expects
    expectations = {
        "上海盒马网络科技有限公司": "上海JO网络科技有限公司",
        "丰田通商（上海）有限公司": "HVVU（上海）有限公司",
        "雅诗兰黛（上海）商贸有限公司": "AUNF（上海）商贸有限公司",
        "北京百度网讯科技有限公司": "北京BC网讯科技有限公司",
        "腾讯科技（深圳）有限公司": "TU科技（深圳）有限公司",
        # The trade name may not be extracted correctly for this one.
        "阿里巴巴集团控股有限公司": "阿里巴巴集团控股有限公司",
    }

    for original_name, expected_masked in expectations.items():
        masked = processor._mask_company_name(original_name)
        # Printed for manual verification only.
        print(f"{original_name} -> {masked} (expected: {expected_masked})")
|
||
|
||
|
||
def test_business_name_extraction():
    """Print what ``_extract_business_name`` returns for several company names.

    NOTE(review): this test contains no assertions. The expected values are
    only printed next to the actual ones, because (per the original author)
    exact results may vary due to LLM extraction. It can only fail if the
    call itself raises.
    """
    processor = NerProcessor()

    # company name -> business (trade) name the author expects
    expectations = {
        "上海盒马网络科技有限公司": "盒马",
        "丰田通商（上海）有限公司": "丰田通商",
        "雅诗兰黛（上海）商贸有限公司": "雅诗兰黛",
        "北京百度网讯科技有限公司": "百度",
        "腾讯科技（深圳）有限公司": "腾讯",
        "律师事务所": "律师事务所",  # edge case
    }

    for company_name, expected_business_name in expectations.items():
        business_name = processor._extract_business_name(company_name)
        # Printed for manual verification only.
        print(f"Company: {company_name} -> Business Name: {business_name} (expected: {expected_business_name})")
|
||
|
||
|
||
def test_json_validation_for_business_name():
    """Check ``LLMResponseValidator.validate_business_name_extraction`` on three payloads.

    One payload carries both ``business_name`` (a string) and ``confidence``
    and must be accepted. Two must be rejected: one without
    ``business_name`` and one whose ``business_name`` is an integer.
    """
    # Imported locally, as in the original, so the validator module is only
    # loaded when this test runs.
    from app.core.utils.llm_validator import LLMResponseValidator

    validate = LLMResponseValidator.validate_business_name_extraction

    # Valid response: assert on truthiness instead of comparing with
    # ``== True`` (PEP 8 / E712).
    valid_response = {
        "business_name": "盒马",
        "confidence": 0.9,
    }
    assert validate(valid_response)

    # Invalid response: required field is missing.
    invalid_response = {
        "confidence": 0.9,
    }
    assert not validate(invalid_response)

    # Invalid response: field has the wrong type.
    invalid_response2 = {
        "business_name": 123,
        "confidence": 0.9,
    }
    assert not validate(invalid_response2)
|
||
|
||
|
||
def test_law_firm_masking():
    """Print what ``_mask_company_name`` returns for several law firm names.

    NOTE(review): this test contains no assertions. The expected values are
    only printed next to the actual ones, because (per the original author)
    exact results may vary due to LLM extraction. It can only fail if the
    call itself raises.
    """
    processor = NerProcessor()

    # law firm name -> masked form the author expects
    expectations = {
        "北京大成律师事务所": "北京D律师事务所",
        "上海锦天城律师事务所": "上海JTC律师事务所",
        "广东广信君达律师事务所": "广东GXJD律师事务所",
    }

    for original_name, expected_masked in expectations.items():
        masked = processor._mask_company_name(original_name)
        # Printed for manual verification only.
        print(f"{original_name} -> {masked} (expected: {expected_masked})")