#!/usr/bin/env python3
"""
Test to verify the fix for multiple occurrence issue in apply_entity_masking_with_alignment.
"""

from typing import Dict, Optional, Tuple


def find_entity_alignment(
    entity_text: str, original_document_text: str
) -> Optional[Tuple[int, int, str]]:
    """Locate the first space-insensitive occurrence of an entity.

    Simplified version of the alignment method for testing. Spaces (``' '``
    only) are stripped from both the entity and the document before
    comparing, so ``"李 淼"`` matches ``"李淼"`` and vice versa.

    Args:
        entity_text: Entity to look for; may contain spaces.
        original_document_text: Text to search; may contain spaces.

    Returns:
        ``(start_pos, end_pos, found_text)`` expressed in
        ``original_document_text`` coordinates, or ``None`` when there is no
        match or the entity is empty once spaces are removed.
    """
    clean_entity = entity_text.replace(" ", "")
    if not clean_entity:
        # An empty needle "matches" everywhere with zero length, which would
        # make callers insert the mask forever without consuming any text.
        return None

    clean_document = original_document_text.replace(" ", "")
    clean_start = clean_document.find(clean_entity)
    if clean_start == -1:
        return None

    return map_char_positions_to_original(
        clean_start, len(clean_entity), original_document_text
    )


def map_char_positions_to_original(
    clean_start: int, entity_length: int, original_text: str
) -> Tuple[int, int, str]:
    """Map a span in the space-stripped text back onto the original text.

    Simplified version of position mapping for testing.

    Args:
        clean_start: Index of the match in the text with spaces removed.
        entity_length: Number of non-space characters in the match.
        original_text: The text that still contains its spaces.

    Returns:
        ``(start_pos, end_pos, found_text)`` where
        ``found_text == original_text[start_pos:end_pos]``. The span starts
        and ends on a non-space character; spaces inside the match are kept.
    """
    text_length = len(original_text)
    original_pos = 0
    clean_pos = 0

    # Walk past the first `clean_start` non-space characters.
    while clean_pos < clean_start and original_pos < text_length:
        if original_text[original_pos] != ' ':
            clean_pos += 1
        original_pos += 1

    # Spaces sitting between the previous character and the match are not part
    # of the entity; skipping them keeps the separator when the span is
    # replaced (otherwise "a b" with entity "b" would yield the span " b").
    while original_pos < text_length and original_text[original_pos] == ' ':
        original_pos += 1

    start_pos = original_pos

    # Consume `entity_length` non-space characters, carrying inner spaces along.
    chars_found = 0
    while chars_found < entity_length and original_pos < text_length:
        if original_text[original_pos] != ' ':
            chars_found += 1
        original_pos += 1

    end_pos = original_pos
    found_text = original_text[start_pos:end_pos]
    return start_pos, end_pos, found_text


def apply_entity_masking_with_alignment_fixed(
    original_document_text: str, entity_mapping: Dict[str, str]
) -> str:
    """Replace every occurrence of each entity with its masked form.

    Fixed implementation that handles multiple occurrences. Entities are
    processed longest first so that a long name is masked before any shorter
    entity contained in it. Matching is space-insensitive (see
    ``find_entity_alignment``).

    For a given entity the document is scanned left to right and the search
    resumes *after* the text that was just inserted. Inserted masks are
    therefore never re-scanned for the same entity, which guarantees
    termination even when a mask contains the entity it replaces (including
    identity mappings such as ``{"X": "X"}``).

    Args:
        original_document_text: Document to mask.
        entity_mapping: Maps entity text to its replacement text.

    Returns:
        The masked document. Progress is also reported on stdout.
    """
    masked_document = original_document_text
    sorted_entities = sorted(entity_mapping, key=len, reverse=True)

    for entity_text in sorted_entities:
        masked_text = entity_mapping[entity_text]
        search_from = 0  # Everything before this offset is already handled.

        while True:
            alignment_result = find_entity_alignment(
                entity_text, masked_document[search_from:]
            )
            if alignment_result is None:
                # No more occurrences found for this entity, move to next entity
                print(f"No more occurrences of '{entity_text}' found in document")
                break

            relative_start, relative_end, _found_text = alignment_result
            # The alignment ran on a suffix; shift back to document coordinates.
            start_pos = search_from + relative_start
            end_pos = search_from + relative_end

            masked_document = (
                masked_document[:start_pos]
                + masked_text
                + masked_document[end_pos:]
            )
            print(f"Masked entity '{entity_text}' -> '{masked_text}' at positions {start_pos}-{end_pos}")

            # Continue after the inserted mask so it is never matched again.
            search_from = start_pos + len(masked_text)

    return masked_document


def test_fix_verification() -> None:
    """Run the masking scenarios and print a PASS/FAIL line for each one."""
    print("Testing Fix for Multiple Occurrence Issue")
    print("=" * 60)

    # Test case 1: Multiple occurrences of the same entity
    print("\nTest Case 1: Multiple occurrences of same entity")
    test_document_1 = "上诉人李淼因合同纠纷,法定代表人李淼,委托代理人李淼。"
    entity_mapping_1 = {"李淼": "李M"}

    print(f"Original: {test_document_1}")
    result_1 = apply_entity_masking_with_alignment_fixed(test_document_1, entity_mapping_1)
    print(f"Result: {result_1}")

    remaining_1 = result_1.count("李淼")
    expected_1 = "上诉人李M因合同纠纷,法定代表人李M,委托代理人李M。"

    if result_1 == expected_1 and remaining_1 == 0:
        print("✅ PASS: All occurrences masked correctly")
    else:
        print(f"❌ FAIL: Expected '{expected_1}', got '{result_1}'")
        print(f" Remaining '李淼' occurrences: {remaining_1}")

    # Test case 2: Multiple entities with multiple occurrences
    print("\nTest Case 2: Multiple entities with multiple occurrences")
    test_document_2 = "上诉人李淼因合同纠纷,法定代表人李淼。北京丰复久信营销科技有限公司,丰复久信公司。"
    entity_mapping_2 = {
        "李淼": "李M",
        "北京丰复久信营销科技有限公司": "北京JO营销科技有限公司",
        "丰复久信公司": "丰复久信公司",
    }

    print(f"Original: {test_document_2}")
    result_2 = apply_entity_masking_with_alignment_fixed(test_document_2, entity_mapping_2)
    print(f"Result: {result_2}")

    remaining_2_li = result_2.count("李淼")
    remaining_2_company = result_2.count("北京丰复久信营销科技有限公司")

    if remaining_2_li == 0 and remaining_2_company == 0:
        print("✅ PASS: All entities masked correctly")
    else:
        print(f"❌ FAIL: Remaining '李淼': {remaining_2_li}, '北京丰复久信营销科技有限公司': {remaining_2_company}")

    # Test case 3: Mixed spacing scenarios
    print("\nTest Case 3: Mixed spacing scenarios")
    test_document_3 = "上诉人李 淼因合同纠纷,法定代表人李淼,委托代理人李 淼。"
    entity_mapping_3 = {"李 淼": "李M", "李淼": "李M"}

    print(f"Original: {test_document_3}")
    result_3 = apply_entity_masking_with_alignment_fixed(test_document_3, entity_mapping_3)
    print(f"Result: {result_3}")

    remaining_3 = result_3.count("李淼") + result_3.count("李 淼")

    if remaining_3 == 0:
        print("✅ PASS: Mixed spacing handled correctly")
    else:
        print(f"❌ FAIL: Remaining occurrences: {remaining_3}")

    # Test case 4: Complex document with real examples
    print("\nTest Case 4: Complex document with real examples")
    test_document_4 = """上诉人(原审原告):北京丰复久信营销科技有限公司,住所地北京市海淀区北小马厂6号1号楼华天大厦1306室。
法定代表人:郭东军,执行董事、经理。
委托诉讼代理人:周大海,北京市康达律师事务所律师。
委托诉讼代理人:王乃哲,北京市康达律师事务所律师。
被上诉人(原审被告):中研智创区块链技术有限公司,住所地天津市津南区双港镇工业园区优谷产业园5号楼-1505。
法定代表人:王欢子,总经理。
委托诉讼代理人:魏鑫,北京市昊衡律师事务所律师。"""

    entity_mapping_4 = {
        "北京丰复久信营销科技有限公司": "北京JO营销科技有限公司",
        "郭东军": "郭DJ",
        "周大海": "周DH",
        "王乃哲": "王NZ",
        "中研智创区块链技术有限公司": "中研智创区块链技术有限公司",
        "王欢子": "王HZ",
        "魏鑫": "魏X",
        "北京市康达律师事务所": "北京市KD律师事务所",
        "北京市昊衡律师事务所": "北京市HH律师事务所",
    }

    print(f"Original length: {len(test_document_4)} characters")
    result_4 = apply_entity_masking_with_alignment_fixed(test_document_4, entity_mapping_4)
    print(f"Result length: {len(result_4)} characters")

    # Check that all entities were masked. An entity whose mask still contains
    # it (e.g. an identity mapping) necessarily remains in the output, so it
    # cannot count as a masking failure.
    unmasked_entities = [
        entity
        for entity, mask in entity_mapping_4.items()
        if entity not in mask and entity in result_4
    ]

    if not unmasked_entities:
        print("✅ PASS: All entities masked in complex document")
    else:
        print(f"❌ FAIL: Unmasked entities: {unmasked_entities}")

    print("\n" + "=" * 60)
    print("Fix Verification Completed!")


if __name__ == "__main__":
    test_fix_verification()