97 lines
3.5 KiB
Python
97 lines
3.5 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Reproduction test for the multiple-occurrence issue in apply_entity_masking_with_alignment:
when an entity appears several times in a document, only its first occurrence is masked.
|
|
"""
|
|
|
|
def find_entity_alignment(entity_text: str, original_document_text: str):
    """Locate the first occurrence of an entity in a document, ignoring spaces.

    Simplified version of the alignment method for testing. The entity and
    the document are compared with every ' ' character removed, so an entity
    written as "John Smith" also matches "JohnSmith" or "Jo hn Smith".

    Args:
        entity_text: Entity to search for; spaces in it are ignored.
        original_document_text: Document to search in.

    Returns:
        The ``(start_pos, end_pos, found_text)`` tuple produced by
        ``map_char_positions_to_original`` for the first match, or ``None``
        when the entity does not occur or has no non-space characters.
    """
    clean_entity = entity_text.replace(" ", "")
    if not clean_entity:
        # An empty needle "matches" at offset 0 of any document, which would
        # make callers splice replacement text in front of the document.
        return None

    # Searching the space-stripped document with str.find yields the same
    # first-match index as comparing a sliding window at every offset.
    clean_document = original_document_text.replace(" ", "")
    clean_start = clean_document.find(clean_entity)
    if clean_start == -1:
        return None

    return map_char_positions_to_original(
        clean_start, len(clean_entity), original_document_text
    )
|
|
|
|
def map_char_positions_to_original(clean_start: int, entity_length: int, original_text: str):
    """Map a match in the space-stripped text back onto the original text.

    Simplified version of position mapping for testing.

    Args:
        clean_start: Index of the match's first character, counted in
            non-space characters of ``original_text``.
        entity_length: Number of non-space characters in the match.
        original_text: Text that still contains its spaces.

    Returns:
        ``(start_pos, end_pos, found_text)`` with
        ``found_text == original_text[start_pos:end_pos]``. The span begins
        on the first matched character and ends right after the last one;
        spaces inside the span are kept. If the text ends before
        ``entity_length`` characters are collected, the span stops at the
        end of the text.
    """
    text_length = len(original_text)
    original_pos = 0
    clean_pos = 0

    # Consume the non-space characters that precede the match.
    while clean_pos < clean_start and original_pos < text_length:
        if original_text[original_pos] != ' ':
            clean_pos += 1
        original_pos += 1

    # Step over spaces sitting between the preceding character and the match;
    # otherwise the span would start on a space and masking would swallow
    # the separator in front of the entity.
    while original_pos < text_length and original_text[original_pos] == ' ':
        original_pos += 1

    start_pos = original_pos

    # Advance until the requested number of non-space characters is covered.
    chars_found = 0
    while chars_found < entity_length and original_pos < text_length:
        if original_text[original_pos] != ' ':
            chars_found += 1
        original_pos += 1

    end_pos = original_pos
    found_text = original_text[start_pos:end_pos]

    return start_pos, end_pos, found_text
|
|
|
|
def apply_entity_masking_with_alignment_current(original_document_text: str, entity_mapping: dict):
    """Mask entities in a document using one alignment lookup per entity.

    Current implementation with the bug: ``find_entity_alignment`` is called
    once per entity and reports a single match, so at most one occurrence of
    each entity is replaced. This behaviour is kept on purpose so the issue
    can be reproduced.

    Args:
        original_document_text: Text to mask.
        entity_mapping: Maps each entity string to its masked replacement.

    Returns:
        The document with the located occurrence of each entity replaced.
        A progress line is printed for every entity, found or not.
    """
    masked_document = original_document_text

    # Longest keys are handled first (presumably so a longer entity is masked
    # before any shorter entity it contains -- confirm against the real code).
    for entity_text in sorted(entity_mapping, key=len, reverse=True):
        masked_text = entity_mapping[entity_text]

        # The lookup runs against the partially masked text, not the original.
        alignment_result = find_entity_alignment(entity_text, masked_document)
        if not alignment_result:
            print(f"Could not find entity '{entity_text}' in document for masking")
            continue

        start_pos, end_pos, _ = alignment_result

        # Splice the masked version in place of the located span.
        prefix = masked_document[:start_pos]
        suffix = masked_document[end_pos:]
        masked_document = prefix + masked_text + suffix

        print(f"Masked entity '{entity_text}' -> '{masked_text}' at positions {start_pos}-{end_pos}")

    return masked_document
|
|
|
|
def test_multiple_occurrences():
    """Test the multiple occurrence issue.

    Masks a sentence that contains the same name three times, prints the
    outcome, and reports whether any unmasked occurrence is left behind.
    """
    print("Testing Multiple Occurrence Issue")
    print("=" * 50)

    # Test document with multiple occurrences of the same entity
    entity_mapping = {"李淼": "李M"}
    test_document = "上诉人李淼因合同纠纷,法定代表人李淼,委托代理人李淼。"

    for line in (
        f"Original document: {test_document}",
        f"Entity mapping: {entity_mapping}",
        f"Expected: All 3 occurrences of '李淼' should be masked",
    ):
        print(line)

    # Test current implementation
    result = apply_entity_masking_with_alignment_current(test_document, entity_mapping)
    print(f"Current result: {result}")

    # Count remaining occurrences
    remaining_count = result.count("李淼")
    print(f"Remaining '李淼' occurrences: {remaining_count}")

    verdict = (
        "❌ ISSUE CONFIRMED: Multiple occurrences are not being masked!"
        if remaining_count > 0
        else "✅ No issue found (unexpected)"
    )
    print(verdict)
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the reproduction scenario.
    test_multiple_occurrences()
|