187 lines
8.2 KiB
Python
187 lines
8.2 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Final test to verify the fix handles multiple occurrences and prevents infinite loops.
|
||
"""
|
||
|
||
def find_entity_alignment(entity_text: str, original_document_text: str):
    """Locate the first space-insensitive occurrence of an entity in a document.

    Plain space characters (``" "``) are removed from both the entity and the
    document before comparing, so ``"李 淼"`` and ``"李淼"`` match each other.
    Other whitespace (tabs, newlines, full-width spaces) is compared literally.

    Args:
        entity_text: Entity to look for; spaces inside it are ignored.
        original_document_text: Document to search.

    Returns:
        The result of ``map_char_positions_to_original`` for the first match
        (a ``(start_pos, end_pos, found_text)`` tuple in original-document
        offsets), or ``None`` when there is no match or when the entity
        contains nothing but spaces.
    """
    clean_entity = entity_text.replace(" ", "")
    if not clean_entity:
        # An empty needle would "match" at offset 0 of every document and
        # yield a truthy zero-length span; callers that replace matches in a
        # loop would then insert text over and over without consuming any.
        return None

    # str.find returns the lowest matching index, i.e. the same first match
    # a manual left-to-right window scan would report.
    clean_start = original_document_text.replace(" ", "").find(clean_entity)
    if clean_start == -1:
        return None

    return map_char_positions_to_original(
        clean_start, len(clean_entity), original_document_text
    )
|
||
|
||
def map_char_positions_to_original(clean_start: int, entity_length: int, original_text: str):
    """Translate a match in the space-stripped text back to original offsets.

    Args:
        clean_start: Index of the match's first character, counted in
            ``original_text`` with every ``" "`` character removed.
        entity_length: Number of non-space characters the match covers.
        original_text: The text including its spaces.

    Returns:
        A ``(start_pos, end_pos, found_text)`` tuple where
        ``found_text == original_text[start_pos:end_pos]``. The span begins
        on the match's first non-space character and ends right after its
        last one, so spaces *inside* the match are kept while spaces before
        and after it are excluded. If the text runs out early, the span is
        truncated at the end of the text.
    """
    text_length = len(original_text)
    original_pos = 0
    clean_pos = 0

    # Walk past the first `clean_start` non-space characters.
    while clean_pos < clean_start and original_pos < text_length:
        if original_text[original_pos] != " ":
            clean_pos += 1
        original_pos += 1

    # The walk above stops right after the preceding non-space character.
    # Skip any spaces that sit between it and the match so they are not
    # reported (and later replaced) as part of the entity.
    while original_pos < text_length and original_text[original_pos] == " ":
        original_pos += 1

    start_pos = original_pos

    # Consume `entity_length` non-space characters, stepping over any spaces
    # interleaved with them.
    chars_found = 0
    while chars_found < entity_length and original_pos < text_length:
        if original_text[original_pos] != " ":
            chars_found += 1
        original_pos += 1

    end_pos = original_pos
    found_text = original_text[start_pos:end_pos]

    return start_pos, end_pos, found_text
|
||
|
||
def apply_entity_masking_with_alignment_fixed(original_document_text: str, entity_mapping: dict):
    """Replace every occurrence of each mapped entity with its masked form.

    Entities are processed longest first so that a short entity cannot
    clobber part of a longer one. Matching is delegated to
    ``find_entity_alignment``. Progress is reported with ``print``.

    Each entity is handled in a single left-to-right pass: after a
    replacement, the search resumes immediately *after* the inserted masked
    text. Consequently a replacement is never matched again (even when the
    masked text still contains the entity), every occurrence is processed
    regardless of how many there are, and the loop is guaranteed to
    terminate without an iteration cap.

    Args:
        original_document_text: Document to mask; it is not modified.
        entity_mapping: Maps entity text to its masked replacement text.

    Returns:
        The masked document.
    """
    masked_document = original_document_text
    sorted_entities = sorted(entity_mapping, key=len, reverse=True)

    for entity_text in sorted_entities:
        masked_text = entity_mapping[entity_text]

        # Replacing text with itself is a no-op; skip it outright.
        if entity_text == masked_text:
            print(f"Skipping entity '{entity_text}' as masked text is identical")
            continue

        # An entity with no non-space characters has nothing to align on and
        # would otherwise "match" a zero-length span at every position.
        if not entity_text.replace(" ", ""):
            print(f"Skipping entity '{entity_text}' as it contains no characters to match")
            continue

        search_from = 0  # offset in masked_document where the next search begins
        iteration_count = 0

        while True:
            iteration_count += 1

            # Only the not-yet-processed tail is searched, so offsets in the
            # result are relative to `search_from`.
            alignment_result = find_entity_alignment(entity_text, masked_document[search_from:])

            if not alignment_result:
                # No more occurrences found for this entity, move to next entity
                print(f"No more occurrences of '{entity_text}' found in document after {iteration_count} iterations")
                break

            relative_start, relative_end, _found_text = alignment_result
            if relative_end <= relative_start:
                # Defensive: a zero-length match consumes nothing, so
                # continuing could never make progress.
                break

            start_pos = search_from + relative_start
            end_pos = search_from + relative_end

            # Replace the found text with the masked version
            masked_document = (
                masked_document[:start_pos] +
                masked_text +
                masked_document[end_pos:]
            )

            print(f"Masked entity '{entity_text}' -> '{masked_text}' at positions {start_pos}-{end_pos} (iteration {iteration_count})")

            # Resume after the inserted text so it is never re-matched.
            search_from = start_pos + len(masked_text)

    return masked_document
|
||
|
||
def test_final_fix():
    """Run four masking scenarios and print a PASS/FAIL line for each.

    The scenarios are: one entity repeated several times, an identity
    mapping that must be skipped, an entity written with and without an
    inner space, and a multi-line document with several entities. Outcomes
    are only printed; nothing is asserted or returned.
    """
    separator = "=" * 70

    print("Testing Final Fix for Multiple Occurrences and Infinite Loop Prevention")
    print(separator)

    # --- Scenario 1: the same entity appears three times ---
    print("\nTest Case 1: Multiple occurrences of same entity")
    repeated_doc = "上诉人李淼因合同纠纷,法定代表人李淼,委托代理人李淼。"
    repeated_mapping = {"李淼": "李M"}
    repeated_expected = "上诉人李M因合同纠纷,法定代表人李M,委托代理人李M。"

    print(f"Original: {repeated_doc}")
    repeated_masked = apply_entity_masking_with_alignment_fixed(repeated_doc, repeated_mapping)
    print(f"Result: {repeated_masked}")

    repeated_leftover = repeated_masked.count("李淼")
    if repeated_masked != repeated_expected or repeated_leftover != 0:
        print(f"❌ FAIL: Expected '{repeated_expected}', got '{repeated_masked}'")
        print(f" Remaining '李淼' occurrences: {repeated_leftover}")
    else:
        print("✅ PASS: All occurrences masked correctly")

    # --- Scenario 2: one mapping is an identity and has to be skipped ---
    print("\nTest Case 2: Entity with same masked text (should skip)")
    identity_doc = "上诉人李淼因合同纠纷,法定代表人李淼。北京丰复久信营销科技有限公司,丰复久信公司。"
    identity_mapping = {
        "李淼": "李M",
        "丰复久信公司": "丰复久信公司",  # identical replacement -> skipped
    }

    print(f"Original: {identity_doc}")
    identity_masked = apply_entity_masking_with_alignment_fixed(identity_doc, identity_mapping)
    print(f"Result: {identity_masked}")

    person_leftover = identity_masked.count("李淼")
    company_leftover = identity_masked.count("丰复久信公司")
    # The company name is expected to survive exactly once, unmasked.
    if person_leftover != 0 or company_leftover != 1:
        print(f"❌ FAIL: Remaining '李淼': {person_leftover}, '丰复久信公司': {company_leftover}")
    else:
        print("✅ PASS: Infinite loop prevented, only different text masked")

    # --- Scenario 3: the entity occurs both with and without an inner space ---
    print("\nTest Case 3: Mixed spacing scenarios")
    spaced_doc = "上诉人李 淼因合同纠纷,法定代表人李淼,委托代理人李 淼。"
    spaced_mapping = {"李 淼": "李M", "李淼": "李M"}

    print(f"Original: {spaced_doc}")
    spaced_masked = apply_entity_masking_with_alignment_fixed(spaced_doc, spaced_mapping)
    print(f"Result: {spaced_masked}")

    spaced_leftover = sum(spaced_masked.count(variant) for variant in ("李淼", "李 淼"))
    if spaced_leftover != 0:
        print(f"❌ FAIL: Remaining occurrences: {spaced_leftover}")
    else:
        print("✅ PASS: Mixed spacing handled correctly")

    # --- Scenario 4: multi-line document with several entities ---
    print("\nTest Case 4: Complex document with real examples")
    complex_doc = "\n".join(
        (
            "上诉人(原审原告):北京丰复久信营销科技有限公司,住所地北京市海淀区北小马厂6号1号楼华天大厦1306室。",
            "法定代表人:郭东军,执行董事、经理。",
            "委托诉讼代理人:周大海,北京市康达律师事务所律师。",
            "委托诉讼代理人:王乃哲,北京市康达律师事务所律师。",
            "被上诉人(原审被告):中研智创区块链技术有限公司,住所地天津市津南区双港镇工业园区优谷产业园5号楼-1505。",
            "法定代表人:王欢子,总经理。",
            "委托诉讼代理人:魏鑫,北京市昊衡律师事务所律师。",
        )
    )

    complex_mapping = {
        "北京丰复久信营销科技有限公司": "北京JO营销科技有限公司",
        "郭东军": "郭DJ",
        "周大海": "周DH",
        "王乃哲": "王NZ",
        "中研智创区块链技术有限公司": "中研智创区块链技术有限公司",  # identical replacement -> skipped
        "王欢子": "王HZ",
        "魏鑫": "魏X",
        "北京市康达律师事务所": "北京市KD律师事务所",
        "北京市昊衡律师事务所": "北京市HH律师事务所",
    }

    print(f"Original length: {len(complex_doc)} characters")
    complex_masked = apply_entity_masking_with_alignment_fixed(complex_doc, complex_mapping)
    print(f"Result length: {len(complex_masked)} characters")

    # Entities that should have been replaced but are still present.
    # Identity mappings are excluded because they are meant to stay.
    still_present = [
        entity
        for entity, replacement in complex_mapping.items()
        if entity != replacement and entity in complex_masked
    ]

    if still_present:
        print(f"❌ FAIL: Unmasked entities: {still_present}")
    else:
        print("✅ PASS: All entities masked correctly in complex document")

    print("\n" + separator)
    print("Final Fix Verification Completed!")
|
||
|
||
if __name__ == "__main__":
    # Run the scenarios only when this file is executed as a script,
    # not when it is imported.
    test_final_fix()
|