#!/usr/bin/env python3
"""
Test to verify the multiple occurrence issue in apply_entity_masking_with_alignment.

Run as a script, this prints a reproduction: a document containing the same
entity three times is passed through a simplified copy of the masking routine,
and the number of occurrences left unmasked is reported.
"""

from typing import Optional, Tuple


def find_entity_alignment(
    entity_text: str, original_document_text: str
) -> Optional[Tuple[int, int, str]]:
    """Simplified version of the alignment method for testing.

    Locate the first occurrence of ``entity_text`` in
    ``original_document_text``, ignoring ASCII space characters on both sides
    of the comparison.

    Args:
        entity_text: Entity to look for; spaces in it are ignored.
        original_document_text: Text to search in.

    Returns:
        ``(start_pos, end_pos, found_text)`` in coordinates of
        ``original_document_text`` for the first match, or ``None`` when the
        entity does not occur or contains no non-space characters.
    """
    clean_entity = entity_text.replace(" ", "")
    if not clean_entity:
        # An empty needle would "match" at offset 0 and make the caller insert
        # mask text at the very start of the document.
        return None

    # Searching the space-stripped document yields the same first-match index
    # as comparing a sliding window of its characters one offset at a time.
    clean_start = original_document_text.replace(" ", "").find(clean_entity)
    if clean_start < 0:
        return None

    return map_char_positions_to_original(
        clean_start, len(clean_entity), original_document_text
    )


def map_char_positions_to_original(
    clean_start: int, entity_length: int, original_text: str
) -> Tuple[int, int, str]:
    """Simplified version of position mapping for testing.

    Translate a match expressed in space-stripped coordinates back into a
    slice of ``original_text``.

    Args:
        clean_start: Index of the match in the text with spaces removed.
        entity_length: Number of non-space characters in the match.
        original_text: The text that still contains its spaces.

    Returns:
        ``(start_pos, end_pos, found_text)`` where
        ``found_text == original_text[start_pos:end_pos]``. The slice starts on
        the first matched character and ends right after the last one; spaces
        between matched characters are part of the slice.
    """
    text_length = len(original_text)

    # Walk past the first ``clean_start`` non-space characters.
    original_pos = 0
    clean_pos = 0
    while clean_pos < clean_start and original_pos < text_length:
        if original_text[original_pos] != ' ':
            clean_pos += 1
        original_pos += 1

    # Skip spaces sitting in front of the match so that they are not included
    # in the span (and therefore not deleted when the span is replaced).
    while original_pos < text_length and original_text[original_pos] == ' ':
        original_pos += 1
    start_pos = original_pos

    # Consume ``entity_length`` non-space characters; stop immediately after
    # the last one so trailing spaces stay outside the span.
    chars_found = 0
    while chars_found < entity_length and original_pos < text_length:
        if original_text[original_pos] != ' ':
            chars_found += 1
        original_pos += 1
    end_pos = original_pos

    found_text = original_text[start_pos:end_pos]
    return start_pos, end_pos, found_text


def apply_entity_masking_with_alignment_current(
    original_document_text: str, entity_mapping: dict
) -> str:
    """Current implementation with the bug.

    Replace each entity in ``entity_mapping`` (entity text -> masked text) with
    its masked text, processing entities from longest to shortest key.

    Known limitation, kept on purpose because this file exists to reproduce
    it: ``find_entity_alignment`` is called once per entity, so only the FIRST
    occurrence of each entity is replaced; later occurrences stay unmasked.

    Args:
        original_document_text: Document to mask.
        entity_mapping: Mapping of entity text to its replacement.

    Returns:
        The document with the first occurrence of each found entity replaced.
    """
    masked_document = original_document_text
    sorted_entities = sorted(entity_mapping.keys(), key=len, reverse=True)

    for entity_text in sorted_entities:
        masked_text = entity_mapping[entity_text]

        # Find the entity in the (progressively masked) document using alignment
        alignment_result = find_entity_alignment(entity_text, masked_document)
        if alignment_result:
            start_pos, end_pos, found_text = alignment_result
            # Replace the found text with the masked version
            masked_document = (
                masked_document[:start_pos]
                + masked_text
                + masked_document[end_pos:]
            )
            print(f"Masked entity '{entity_text}' -> '{masked_text}' at positions {start_pos}-{end_pos}")
        else:
            print(f"Could not find entity '{entity_text}' in document for masking")

    return masked_document


def test_multiple_occurrences():
    """Test the multiple occurrence issue.

    Masks a document that contains the same entity three times and prints how
    many occurrences remain afterwards.
    """
    print("Testing Multiple Occurrence Issue")
    print("=" * 50)

    # Test document with multiple occurrences of the same entity
    test_document = "上诉人李淼因合同纠纷,法定代表人李淼,委托代理人李淼。"
    entity_mapping = {
        "李淼": "李M"
    }

    print(f"Original document: {test_document}")
    print(f"Entity mapping: {entity_mapping}")
    print("Expected: All 3 occurrences of '李淼' should be masked")

    # Test current implementation
    result = apply_entity_masking_with_alignment_current(test_document, entity_mapping)
    print(f"Current result: {result}")

    # Count remaining occurrences
    remaining_count = result.count("李淼")
    print(f"Remaining '李淼' occurrences: {remaining_count}")

    if remaining_count > 0:
        print("❌ ISSUE CONFIRMED: Multiple occurrences are not being masked!")
    else:
        print("✅ No issue found (unexpected)")


if __name__ == "__main__":
    test_multiple_occurrences()