legal-doc-masker/backend/tests/test_final_fix.py

187 lines
8.2 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Final test to verify the fix handles multiple occurrences and prevents infinite loops.
"""
def find_entity_alignment(entity_text: str, original_document_text: str):
    """Locate the first occurrence of an entity in a document, ignoring spaces.

    Both the entity and the document are compared with every ASCII space
    (' ') removed, so "李 淼" matches "李淼" and vice versa. Only the plain
    space character is ignored; other whitespace (tabs, newlines) is
    significant.

    Args:
        entity_text: Entity to search for; may contain spaces.
        original_document_text: Document text to search in.

    Returns:
        The ``(start_pos, end_pos, found_text)`` tuple produced by
        ``map_char_positions_to_original`` for the first match, or ``None``
        when there is no match or the entity has no non-space characters.
    """
    clean_entity = entity_text.replace(" ", "")
    # An empty needle "matches" everywhere with a zero-width span; callers
    # that replace the span would then insert text forever. Treat as no match.
    if not clean_entity:
        return None

    # Search in the space-stripped document; str.find returns the first
    # (left-most) match, the same one a manual left-to-right scan would find.
    clean_start = original_document_text.replace(" ", "").find(clean_entity)
    if clean_start == -1:
        return None

    return map_char_positions_to_original(
        clean_start, len(clean_entity), original_document_text
    )
def map_char_positions_to_original(clean_start: int, entity_length: int, original_text: str):
    """Map a span in the space-stripped text back onto the original text.

    Args:
        clean_start: Index of the span's first character in the text with
            all ASCII spaces removed.
        entity_length: Number of non-space characters in the span.
        original_text: The original text, spaces included.

    Returns:
        ``(start_pos, end_pos, found_text)`` where ``start_pos`` is the index
        of the span's first non-space character in ``original_text``,
        ``end_pos`` is one past its last non-space character, and
        ``found_text`` is ``original_text[start_pos:end_pos]`` (it keeps any
        spaces that sit *inside* the span).
    """
    text_length = len(original_text)
    original_pos = 0
    clean_pos = 0

    # Walk past the first `clean_start` non-space characters.
    while clean_pos < clean_start and original_pos < text_length:
        if original_text[original_pos] != ' ':
            clean_pos += 1
        original_pos += 1

    # The loop above stops right after the preceding non-space character, so
    # any spaces separating it from the entity are still ahead of us. Skip
    # them so the span starts on the entity itself; otherwise replacing the
    # span would also delete the separator in front of the entity.
    while original_pos < text_length and original_text[original_pos] == ' ':
        original_pos += 1
    start_pos = original_pos

    # Consume `entity_length` non-space characters; interior spaces are
    # stepped over and therefore remain part of the span.
    chars_found = 0
    while chars_found < entity_length and original_pos < text_length:
        if original_text[original_pos] != ' ':
            chars_found += 1
        original_pos += 1
    end_pos = original_pos

    found_text = original_text[start_pos:end_pos]
    return start_pos, end_pos, found_text
def apply_entity_masking_with_alignment_fixed(original_document_text: str, entity_mapping: dict):
    """Replace every occurrence of each mapped entity with its masked form.

    Entities are processed longest-first so that a longer entity is masked
    before any shorter entity it contains. Matching is delegated to
    ``find_entity_alignment`` (space-insensitive).

    After each replacement the search resumes *after* the inserted mask
    text instead of restarting at the beginning of the document. This
    guarantees termination without an arbitrary iteration cap, masks every
    occurrence regardless of how many there are, and never re-matches text
    that was just inserted (e.g. when the mask contains the entity or
    differs from it only by spaces).

    Args:
        original_document_text: The document to mask.
        entity_mapping: Maps entity text to its masked replacement.

    Returns:
        The document with all mapped entities replaced.
    """
    masked_document = original_document_text
    sorted_entities = sorted(entity_mapping, key=len, reverse=True)

    for entity_text in sorted_entities:
        masked_text = entity_mapping[entity_text]

        # Replacing text with itself is a no-op; skip it outright.
        if entity_text == masked_text:
            print(f"Skipping entity '{entity_text}' as masked text is identical")
            continue

        # An entity with no non-space characters would match everywhere with
        # a zero-width span and only ever insert mask text.
        if not entity_text.replace(" ", ""):
            print(f"Skipping entity '{entity_text}' as it has no searchable characters")
            continue

        search_from = 0  # offset in masked_document where the next search begins
        iteration_count = 0
        while True:
            iteration_count += 1
            alignment_result = find_entity_alignment(
                entity_text, masked_document[search_from:]
            )
            if not alignment_result:
                print(f"No more occurrences of '{entity_text}' found in document after {iteration_count} iterations")
                break

            # Positions are relative to the searched slice; shift them back.
            rel_start, rel_end, _found_text = alignment_result
            if rel_end <= rel_start:
                # Defensive: a zero-width match cannot make progress.
                break
            start_pos = search_from + rel_start
            end_pos = search_from + rel_end

            masked_document = (
                masked_document[:start_pos]
                + masked_text
                + masked_document[end_pos:]
            )
            print(f"Masked entity '{entity_text}' -> '{masked_text}' at positions {start_pos}-{end_pos} (iteration {iteration_count})")

            # Continue strictly after the inserted mask so it is never rescanned.
            search_from = start_pos + len(masked_text)

    return masked_document
def test_final_fix():
    """Exercise the masking fix against four scenarios and fail loudly.

    Scenarios: repeated occurrences of one entity, an entity whose mask is
    identical to itself (must be skipped), mixed spacing inside entities,
    and a multi-line document with several entities. Each scenario prints
    a PASS/FAIL line; any failure also raises ``AssertionError`` at the end
    so test runners (and the script's exit status) reflect the outcome.
    """
    failures = []  # names of scenarios that did not produce the expected result

    print("Testing Final Fix for Multiple Occurrences and Infinite Loop Prevention")
    print("=" * 70)

    # Test case 1: Multiple occurrences of the same entity (should work)
    print("\nTest Case 1: Multiple occurrences of same entity")
    test_document_1 = "上诉人李淼因合同纠纷,法定代表人李淼,委托代理人李淼。"
    entity_mapping_1 = {"李淼": "李M"}
    print(f"Original: {test_document_1}")
    result_1 = apply_entity_masking_with_alignment_fixed(test_document_1, entity_mapping_1)
    print(f"Result: {result_1}")
    remaining_1 = result_1.count("李淼")
    # Masking only replaces the names; the full-width commas of the input
    # must survive unchanged in the expected output.
    expected_1 = "上诉人李M因合同纠纷,法定代表人李M,委托代理人李M。"
    if result_1 == expected_1 and remaining_1 == 0:
        print("✅ PASS: All occurrences masked correctly")
    else:
        print(f"❌ FAIL: Expected '{expected_1}', got '{result_1}'")
        print(f" Remaining '李淼' occurrences: {remaining_1}")
        failures.append("Test Case 1")

    # Test case 2: Entity with same masked text (should skip to prevent infinite loop)
    print("\nTest Case 2: Entity with same masked text (should skip)")
    test_document_2 = "上诉人李淼因合同纠纷,法定代表人李淼。北京丰复久信营销科技有限公司,丰复久信公司。"
    entity_mapping_2 = {
        "李淼": "李M",
        "丰复久信公司": "丰复久信公司",  # Same text - should be skipped
    }
    print(f"Original: {test_document_2}")
    result_2 = apply_entity_masking_with_alignment_fixed(test_document_2, entity_mapping_2)
    print(f"Result: {result_2}")
    remaining_2_li = result_2.count("李淼")
    remaining_2_company = result_2.count("丰复久信公司")
    if remaining_2_li == 0 and remaining_2_company == 1:  # Company should remain unmasked
        print("✅ PASS: Infinite loop prevented, only different text masked")
    else:
        print(f"❌ FAIL: Remaining '李淼': {remaining_2_li}, '丰复久信公司': {remaining_2_company}")
        failures.append("Test Case 2")

    # Test case 3: Mixed spacing scenarios
    print("\nTest Case 3: Mixed spacing scenarios")
    test_document_3 = "上诉人李 淼因合同纠纷,法定代表人李淼,委托代理人李 淼。"
    entity_mapping_3 = {"李 淼": "李M", "李淼": "李M"}
    print(f"Original: {test_document_3}")
    result_3 = apply_entity_masking_with_alignment_fixed(test_document_3, entity_mapping_3)
    print(f"Result: {result_3}")
    remaining_3 = result_3.count("李淼") + result_3.count("李 淼")
    if remaining_3 == 0:
        print("✅ PASS: Mixed spacing handled correctly")
    else:
        print(f"❌ FAIL: Remaining occurrences: {remaining_3}")
        failures.append("Test Case 3")

    # Test case 4: Complex document with real examples
    print("\nTest Case 4: Complex document with real examples")
    test_document_4 = """上诉人原审原告北京丰复久信营销科技有限公司住所地北京市海淀区北小马厂6号1号楼华天大厦1306室。
法定代表人:郭东军,执行董事、经理。
委托诉讼代理人:周大海,北京市康达律师事务所律师。
委托诉讼代理人:王乃哲,北京市康达律师事务所律师。
被上诉人原审被告中研智创区块链技术有限公司住所地天津市津南区双港镇工业园区优谷产业园5号楼-1505。
法定代表人:王欢子,总经理。
委托诉讼代理人:魏鑫,北京市昊衡律师事务所律师。"""
    entity_mapping_4 = {
        "北京丰复久信营销科技有限公司": "北京JO营销科技有限公司",
        "郭东军": "郭DJ",
        "周大海": "周DH",
        "王乃哲": "王NZ",
        "中研智创区块链技术有限公司": "中研智创区块链技术有限公司",  # Same text - should be skipped
        "王欢子": "王HZ",
        "魏鑫": "魏X",
        "北京市康达律师事务所": "北京市KD律师事务所",
        "北京市昊衡律师事务所": "北京市HH律师事务所",
    }
    print(f"Original length: {len(test_document_4)} characters")
    result_4 = apply_entity_masking_with_alignment_fixed(test_document_4, entity_mapping_4)
    print(f"Result length: {len(result_4)} characters")

    # An entity counts as unmasked only if it was supposed to change
    # (mask differs from the entity) and still appears in the result.
    unmasked_entities = [
        entity
        for entity, masked in entity_mapping_4.items()
        if entity != masked and entity in result_4
    ]
    if not unmasked_entities:
        print("✅ PASS: All entities masked correctly in complex document")
    else:
        print(f"❌ FAIL: Unmasked entities: {unmasked_entities}")
        failures.append("Test Case 4")

    print("\n" + "=" * 70)
    print("Final Fix Verification Completed!")

    # Printing alone lets a broken implementation pass unnoticed under a
    # test runner; surface any failed scenario as a real test failure.
    assert not failures, f"Failed scenarios: {failures}"
# Script entry point: run the verification scenarios when this file is
# executed directly rather than imported.
if __name__ == "__main__":
    test_final_fix()