legal-doc-masker/backend/tests/test_fix_verification.py

174 lines
7.4 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Test to verify the fix for multiple occurrence issue in apply_entity_masking_with_alignment.
"""
def find_entity_alignment(entity_text: str, original_document_text: str):
    """Locate the first occurrence of an entity in a document, ignoring spaces.

    Simplified version of the alignment method for testing. Both the entity
    and the document are compared with their ASCII space characters removed,
    so "李 淼" and "李淼" are treated as the same entity.

    Args:
        entity_text: Entity to look for; spaces inside it are ignored.
        original_document_text: Document to search; spaces are ignored while
            matching.

    Returns:
        The ``(start_pos, end_pos, found_text)`` tuple produced by
        ``map_char_positions_to_original`` for the first match, or ``None``
        when the entity does not occur or contains no non-space characters.
    """
    clean_entity = entity_text.replace(" ", "")
    if not clean_entity:
        # An empty needle would "match" at index 0 with a zero-width span;
        # the resulting tuple is truthy, so a caller that replaces matches in
        # a loop would never terminate.
        return None

    # Searching the space-stripped document yields the same first match index
    # as comparing slices of the non-space characters one position at a time.
    clean_document = original_document_text.replace(" ", "")
    clean_start = clean_document.find(clean_entity)
    if clean_start == -1:
        return None

    return map_char_positions_to_original(
        clean_start, len(clean_entity), original_document_text
    )
def map_char_positions_to_original(clean_start: int, entity_length: int, original_text: str):
    """Map a match in the space-stripped text back to the original text.

    Simplified version of position mapping for testing.

    Args:
        clean_start: Index of the match's first character, counted over the
            non-space characters of ``original_text`` only.
        entity_length: Number of non-space characters in the match.
        original_text: The text that still contains its spaces.

    Returns:
        A ``(start_pos, end_pos, found_text)`` tuple where
        ``found_text == original_text[start_pos:end_pos]``. The span starts at
        the entity's first non-space character and ends right after its last
        one; spaces between the entity's characters are included.
    """
    text_length = len(original_text)
    original_pos = 0
    clean_pos = 0

    # Walk past the first `clean_start` non-space characters.
    while clean_pos < clean_start and original_pos < text_length:
        if original_text[original_pos] != ' ':
            clean_pos += 1
        original_pos += 1

    # The walk above stops right after the previous non-space character, so
    # any spaces sitting before the entity must be skipped; otherwise they
    # would become part of the span and be swallowed by a replacement.
    while original_pos < text_length and original_text[original_pos] == ' ':
        original_pos += 1
    start_pos = original_pos

    # Consume `entity_length` non-space characters (and spaces between them).
    chars_found = 0
    while chars_found < entity_length and original_pos < text_length:
        if original_text[original_pos] != ' ':
            chars_found += 1
        original_pos += 1
    end_pos = original_pos

    return start_pos, end_pos, original_text[start_pos:end_pos]
def apply_entity_masking_with_alignment_fixed(original_document_text: str, entity_mapping: dict):
    """Replace every occurrence of each mapped entity with its masked form.

    Entities are processed longest first. Each entity is located with
    ``find_entity_alignment`` (which ignores spaces) and replaced repeatedly
    until no further occurrence is found.

    Args:
        original_document_text: The document to mask.
        entity_mapping: Maps entity text to the masked text replacing it.

    Returns:
        The document with all located occurrences replaced.
    """
    masked_document = original_document_text
    sorted_entities = sorted(entity_mapping.keys(), key=len, reverse=True)

    for entity_text in sorted_entities:
        masked_text = entity_mapping[entity_text]

        if not entity_text.replace(" ", ""):
            # An entity without any non-space character cannot be aligned in
            # a meaningful way; trying would only yield zero-width matches.
            print(f"Skipping empty entity '{entity_text}'")
            continue

        # Resume each search right after the previous replacement. Rescanning
        # from the start would find the freshly inserted mask again whenever
        # the mask contains the entity (e.g. an entity mapped to itself) and
        # loop forever.
        search_from = 0
        while True:
            alignment_result = find_entity_alignment(
                entity_text, masked_document[search_from:]
            )
            if not alignment_result:
                # No more occurrences found for this entity, move to next entity
                print(f"No more occurrences of '{entity_text}' found in document")
                break

            relative_start, relative_end, _found_text = alignment_result
            if relative_end <= relative_start:
                # Defensive: a zero-width match would never make progress.
                break

            # Positions are relative to the searched suffix; shift them back
            # into whole-document coordinates.
            start_pos = search_from + relative_start
            end_pos = search_from + relative_end

            masked_document = (
                masked_document[:start_pos]
                + masked_text
                + masked_document[end_pos:]
            )
            print(f"Masked entity '{entity_text}' -> '{masked_text}' at positions {start_pos}-{end_pos}")

            search_from = start_pos + len(masked_text)

    return masked_document
def test_fix_verification():
    """Verify that every occurrence of each mapped entity gets masked.

    Runs four scenarios through ``apply_entity_masking_with_alignment_fixed``
    and prints a PASS/FAIL line for each one. The failed scenarios are
    collected and asserted on at the end, so a test runner reports a failure
    instead of silently passing on printed output.

    Raises:
        AssertionError: If at least one scenario failed.
    """
    failures = []

    print("Testing Fix for Multiple Occurrence Issue")
    print("=" * 60)

    # Test case 1: Multiple occurrences of the same entity
    print("\nTest Case 1: Multiple occurrences of same entity")
    test_document_1 = "上诉人李淼因合同纠纷,法定代表人李淼,委托代理人李淼。"
    entity_mapping_1 = {"李淼": "李M"}
    print(f"Original: {test_document_1}")
    result_1 = apply_entity_masking_with_alignment_fixed(test_document_1, entity_mapping_1)
    print(f"Result: {result_1}")
    remaining_1 = result_1.count("李淼")
    # Masking must only rewrite the entity itself, so the expected text is the
    # document with each occurrence swapped and all punctuation left intact.
    expected_1 = test_document_1.replace("李淼", "李M")
    if result_1 == expected_1 and remaining_1 == 0:
        print("✅ PASS: All occurrences masked correctly")
    else:
        print(f"❌ FAIL: Expected '{expected_1}', got '{result_1}'")
        print(f" Remaining '李淼' occurrences: {remaining_1}")
        failures.append("Test Case 1")

    # Test case 2: Multiple entities with multiple occurrences
    print("\nTest Case 2: Multiple entities with multiple occurrences")
    test_document_2 = "上诉人李淼因合同纠纷,法定代表人李淼。北京丰复久信营销科技有限公司,丰复久信公司。"
    entity_mapping_2 = {
        "李淼": "李M",
        "北京丰复久信营销科技有限公司": "北京JO营销科技有限公司",
        "丰复久信公司": "丰复久信公司"
    }
    print(f"Original: {test_document_2}")
    result_2 = apply_entity_masking_with_alignment_fixed(test_document_2, entity_mapping_2)
    print(f"Result: {result_2}")
    remaining_2_li = result_2.count("李淼")
    remaining_2_company = result_2.count("北京丰复久信营销科技有限公司")
    if remaining_2_li == 0 and remaining_2_company == 0:
        print("✅ PASS: All entities masked correctly")
    else:
        print(f"❌ FAIL: Remaining '李淼': {remaining_2_li}, '北京丰复久信营销科技有限公司': {remaining_2_company}")
        failures.append("Test Case 2")

    # Test case 3: Mixed spacing scenarios
    print("\nTest Case 3: Mixed spacing scenarios")
    test_document_3 = "上诉人李 淼因合同纠纷,法定代表人李淼,委托代理人李 淼。"
    entity_mapping_3 = {"李 淼": "李M", "李淼": "李M"}
    print(f"Original: {test_document_3}")
    result_3 = apply_entity_masking_with_alignment_fixed(test_document_3, entity_mapping_3)
    print(f"Result: {result_3}")
    remaining_3 = result_3.count("李淼") + result_3.count("李 淼")
    if remaining_3 == 0:
        print("✅ PASS: Mixed spacing handled correctly")
    else:
        print(f"❌ FAIL: Remaining occurrences: {remaining_3}")
        failures.append("Test Case 3")

    # Test case 4: Complex document with real examples
    print("\nTest Case 4: Complex document with real examples")
    test_document_4 = """上诉人原审原告北京丰复久信营销科技有限公司住所地北京市海淀区北小马厂6号1号楼华天大厦1306室。
法定代表人:郭东军,执行董事、经理。
委托诉讼代理人:周大海,北京市康达律师事务所律师。
委托诉讼代理人:王乃哲,北京市康达律师事务所律师。
被上诉人原审被告中研智创区块链技术有限公司住所地天津市津南区双港镇工业园区优谷产业园5号楼-1505。
法定代表人:王欢子,总经理。
委托诉讼代理人:魏鑫,北京市昊衡律师事务所律师。"""
    entity_mapping_4 = {
        "北京丰复久信营销科技有限公司": "北京JO营销科技有限公司",
        "郭东军": "郭DJ",
        "周大海": "周DH",
        "王乃哲": "王NZ",
        "中研智创区块链技术有限公司": "中研智创区块链技术有限公司",
        "王欢子": "王HZ",
        "魏鑫": "魏X",
        "北京市康达律师事务所": "北京市KD律师事务所",
        "北京市昊衡律师事务所": "北京市HH律师事务所"
    }
    print(f"Original length: {len(test_document_4)} characters")
    result_4 = apply_entity_masking_with_alignment_fixed(test_document_4, entity_mapping_4)
    print(f"Result length: {len(result_4)} characters")

    # Check that all entities were masked. An entity that is mapped to itself
    # necessarily still appears in the result, so it is not counted.
    unmasked_entities = [
        entity
        for entity, masked in entity_mapping_4.items()
        if entity != masked and entity in result_4
    ]
    if not unmasked_entities:
        print("✅ PASS: All entities masked in complex document")
    else:
        print(f"❌ FAIL: Unmasked entities: {unmasked_entities}")
        failures.append("Test Case 4")

    print("\n" + "=" * 60)
    print("Fix Verification Completed!")

    # Printed FAIL lines alone never fail a test run; surface them here.
    assert not failures, f"Failed test cases: {failures}"
# Run the verification directly when this file is executed as a script.
if __name__ == "__main__":
    test_fix_verification()