import pytest
from app.core.document_handlers.ner_processor import NerProcessor


def _assert_sequential_name_masks(processor, cases):
    """Mask each name in order and compare it with the expected value.

    A fresh surname counter is created for every call and shared by all
    names in ``cases``, so the order of the cases matters.

    Args:
        processor: Object exposing ``_mask_chinese_name(name, counter)``.
        cases: Iterable of ``(original_name, expected_masked)`` pairs.
    """
    surname_counter = {}
    for original_name, expected_masked in cases:
        masked = processor._mask_chinese_name(original_name, surname_counter)
        assert masked == expected_masked, f"Expected {expected_masked}, got {masked} for {original_name}"


def test_generate_masked_mapping():
    """Check `_generate_masked_mapping` against one or more entities per type."""
    processor = NerProcessor()
    unique_entities = [
        {'text': '李强', 'type': '人名'},
        {'text': '李强', 'type': '人名'},  # Duplicate to test numbering
        {'text': '王小明', 'type': '人名'},
        {'text': 'Acme Manufacturing Inc.', 'type': '英文公司名', 'industry': 'manufacturing'},
        {'text': 'Google LLC', 'type': '英文公司名'},
        {'text': 'A公司', 'type': '公司名称'},
        {'text': 'B公司', 'type': '公司名称'},
        {'text': 'John Smith', 'type': '英文人名'},
        {'text': 'Elizabeth Windsor', 'type': '英文人名'},
        {'text': '华梦龙光伏项目', 'type': '项目名'},
        {'text': '案号12345', 'type': '案号'},
        {'text': '310101198802080000', 'type': '身份证号'},
        {'text': '9133021276453538XT', 'type': '社会信用代码'},
    ]
    linkage = {
        'entity_groups': [
            {
                'group_id': 'g1',
                'group_type': '公司名称',
                'entities': [
                    {'text': 'A公司', 'type': '公司名称', 'is_primary': True},
                    {'text': 'B公司', 'type': '公司名称', 'is_primary': False},
                ],
            },
            {
                'group_id': 'g2',
                'group_type': '人名',
                'entities': [
                    {'text': '李强', 'type': '人名', 'is_primary': True},
                    {'text': '李强', 'type': '人名', 'is_primary': False},
                ],
            },
        ]
    }

    mapping = processor._generate_masked_mapping(unique_entities, linkage)

    # Chinese personal names - updated for new Chinese name masking rules
    assert mapping['李强'] == '李Q'
    assert mapping['王小明'] == '王XM'

    # English company names
    assert mapping['Acme Manufacturing Inc.'] == 'MANUFACTURING'
    assert mapping['Google LLC'] == 'COMPANY'

    # Company names in the same group - updated for new company masking rules
    # Note: The exact results may vary due to LLM extraction
    # NOTE(review): for any string value these two assertions cannot fail --
    # an unmasked 'A公司' still contains '公司'. They only guard against a
    # missing key. Tighten once the expected masked form is settled.
    assert '公司' in mapping['A公司'] or mapping['A公司'] != 'A公司'
    assert '公司' in mapping['B公司'] or mapping['B公司'] != 'B公司'

    # English personal names
    assert mapping['John Smith'] == 'J*** S***'
    assert mapping['Elizabeth Windsor'] == 'E*** W***'

    # Project name
    assert mapping['华梦龙光伏项目'].endswith('项目')

    # Case number
    assert mapping['案号12345'] == '***'

    # ID card number
    assert mapping['310101198802080000'] == 'XXXXXX'

    # Social credit code
    assert mapping['9133021276453538XT'] == 'XXXXXXXX'


def test_chinese_name_pinyin_masking():
    """Test Chinese name masking with pinyin functionality"""
    processor = NerProcessor()

    # Basic Chinese name masking
    basic_cases = [
        ("李强", "李Q"),
        ("张韶涵", "张SH"),
        ("张若宇", "张RY"),
        ("白锦程", "白JC"),
        ("王小明", "王XM"),
        ("陈志强", "陈ZQ"),
    ]
    _assert_sequential_name_masks(processor, basic_cases)

    # Duplicate handling (evaluated in order against one shared counter)
    duplicate_cases = [
        ("李强", "李Q"),
        ("李强", "李Q2"),  # Should be numbered
        ("李倩", "李Q3"),  # Should be numbered
        ("张韶涵", "张SH"),
        ("张韶涵", "张SH2"),  # Should be numbered
        ("张若宇", "张RY"),  # Different initials, should not be numbered
    ]
    _assert_sequential_name_masks(processor, duplicate_cases)

    # Edge cases
    edge_cases = [
        ("", ""),  # Empty string
        ("李", "李"),  # Single character
        ("李强强", "李QQ"),  # Multiple characters with same pinyin
    ]
    _assert_sequential_name_masks(processor, edge_cases)


def test_chinese_name_integration():
    """Test Chinese name masking integrated with the full mapping process"""
    processor = NerProcessor()

    # Chinese names in the full mapping context
    unique_entities = [
        {'text': '李强', 'type': '人名'},
        {'text': '张韶涵', 'type': '人名'},
        {'text': '张若宇', 'type': '人名'},
        {'text': '白锦程', 'type': '人名'},
        {'text': '李强', 'type': '人名'},  # Duplicate
        {'text': '张韶涵', 'type': '人名'},  # Duplicate
    ]
    linkage = {
        'entity_groups': [
            {
                'group_id': 'g1',
                'group_type': '人名',
                'entities': [
                    {'text': '李强', 'type': '人名', 'is_primary': True},
                    {'text': '张韶涵', 'type': '人名', 'is_primary': True},
                    {'text': '张若宇', 'type': '人名', 'is_primary': True},
                    {'text': '白锦程', 'type': '人名', 'is_primary': True},
                ],
            }
        ]
    }

    mapping = processor._generate_masked_mapping(unique_entities, linkage)

    # Verify the mapping results
    assert mapping['李强'] == '李Q'
    assert mapping['张韶涵'] == '张SH'
    assert mapping['张若宇'] == '张RY'
    assert mapping['白锦程'] == '白JC'

    # Check that duplicates are handled correctly
    # The second occurrence should be numbered
    # NOTE(review): the duplicates share their text with the first
    # occurrence; presumably the mapping is keyed by text, so confirm where
    # a numbered value is expected to appear.
    assert '李Q2' in mapping.values() or '张SH2' in mapping.values()


def test_lawyer_and_judge_names():
    """Test that lawyer and judge names follow the same Chinese name rules"""
    processor = NerProcessor()

    # Lawyer, judge and prosecutor name entities
    test_entities = [
        {'text': '王律师', 'type': '律师姓名'},
        {'text': '李法官', 'type': '审判人员姓名'},
        {'text': '张检察官', 'type': '检察官姓名'},
    ]
    linkage = {
        'entity_groups': [
            {
                'group_id': 'g1',
                'group_type': '律师姓名',
                'entities': [{'text': '王律师', 'type': '律师姓名', 'is_primary': True}],
            },
            {
                'group_id': 'g2',
                'group_type': '审判人员姓名',
                'entities': [{'text': '李法官', 'type': '审判人员姓名', 'is_primary': True}],
            },
            {
                'group_id': 'g3',
                'group_type': '检察官姓名',
                'entities': [{'text': '张检察官', 'type': '检察官姓名', 'is_primary': True}],
            },
        ]
    }

    mapping = processor._generate_masked_mapping(test_entities, linkage)

    # These should follow the same Chinese name masking rules
    assert mapping['王律师'] == '王L'
    assert mapping['李法官'] == '李F'
    assert mapping['张检察官'] == '张JC'


def test_company_name_masking():
    """Test company name masking with business name extraction

    Smoke test: results are printed, not asserted, so it only fails if
    `_mask_company_name` raises.
    """
    processor = NerProcessor()

    # Basic company name masking
    test_cases = [
        ("上海盒马网络科技有限公司", "上海JO网络科技有限公司"),
        ("丰田通商(上海)有限公司", "HVVU(上海)有限公司"),
        ("雅诗兰黛(上海)商贸有限公司", "AUNF(上海)商贸有限公司"),
        ("北京百度网讯科技有限公司", "北京BC网讯科技有限公司"),
        ("腾讯科技(深圳)有限公司", "TU科技(深圳)有限公司"),
        ("阿里巴巴集团控股有限公司", "阿里巴巴集团控股有限公司"),  # The trade name may not be extracted correctly
    ]

    for original_name, expected_masked in test_cases:
        masked = processor._mask_company_name(original_name)
        print(f"{original_name} -> {masked} (expected: {expected_masked})")
        # Note: The exact results may vary due to LLM extraction, so we'll just print for verification


def test_business_name_extraction():
    """Test business name extraction from company names

    Smoke test: results are printed, not asserted, so it only fails if
    `_extract_business_name` raises.
    """
    processor = NerProcessor()

    # Business name extraction
    test_cases = [
        ("上海盒马网络科技有限公司", "盒马"),
        ("丰田通商(上海)有限公司", "丰田通商"),
        ("雅诗兰黛(上海)商贸有限公司", "雅诗兰黛"),
        ("北京百度网讯科技有限公司", "百度"),
        ("腾讯科技(深圳)有限公司", "腾讯"),
        ("律师事务所", "律师事务所"),  # Edge case
    ]

    for company_name, expected_business_name in test_cases:
        business_name = processor._extract_business_name(company_name)
        print(f"Company: {company_name} -> Business Name: {business_name} (expected: {expected_business_name})")
        # Note: The exact results may vary due to LLM extraction, so we'll just print for verification


def test_json_validation_for_business_name():
    """Test JSON validation for business name extraction responses"""
    from app.core.utils.llm_validator import LLMResponseValidator

    # Identity checks (`is True` / `is False`) replace `== True` / `== False`
    # per PEP 8 while still requiring an actual boolean result.

    # Valid JSON response
    valid_response = {
        "business_name": "盒马",
        "confidence": 0.9,
    }
    assert LLMResponseValidator.validate_business_name_extraction(valid_response) is True

    # Invalid JSON response (missing required field)
    invalid_response = {
        "confidence": 0.9,
    }
    assert LLMResponseValidator.validate_business_name_extraction(invalid_response) is False

    # Invalid JSON response (wrong type)
    invalid_response2 = {
        "business_name": 123,
        "confidence": 0.9,
    }
    assert LLMResponseValidator.validate_business_name_extraction(invalid_response2) is False


def test_law_firm_masking():
    """Test law firm name masking

    Smoke test: results are printed, not asserted, so it only fails if
    `_mask_company_name` raises.
    """
    processor = NerProcessor()

    # Law firm name masking
    test_cases = [
        ("北京大成律师事务所", "北京D律师事务所"),
        ("上海锦天城律师事务所", "上海JTC律师事务所"),
        ("广东广信君达律师事务所", "广东GXJD律师事务所"),
    ]

    for original_name, expected_masked in test_cases:
        masked = processor._mask_company_name(original_name)
        print(f"{original_name} -> {masked} (expected: {expected_masked})")
        # Note: The exact results may vary due to LLM extraction, so we'll just print for verification