# legal-doc-masker/backend/tests/test_fix_verification.py

#!/usr/bin/env python3
"""
Test to verify the fix for multiple occurrence issue in apply_entity_masking_with_alignment.
"""

def find_entity_alignment(entity_text: str, original_document_text: str):
    """Locate the first space-insensitive occurrence of an entity in a document.

    Simplified version of the alignment method for testing. The entity and the
    document are compared with space characters (" ") removed from both, so an
    entity written with or without inner spaces matches either spelling in the
    document.

    Args:
        entity_text: Entity to look for; spaces in it are ignored.
        original_document_text: Text to search; spaces in it are ignored
            during comparison.

    Returns:
        The result of ``map_char_positions_to_original`` for the first match
        (unpacked by callers as ``(start_pos, end_pos, found_text)``), or
        ``None`` when the entity has no non-space characters or does not occur
        in the document.
    """
    clean_entity = entity_text.replace(" ", "")
    if not clean_entity:
        # An empty pattern "matches" everywhere with zero width; a caller that
        # replaces matches in a loop would never make progress on it.
        return None

    clean_document = original_document_text.replace(" ", "")
    clean_start = clean_document.find(clean_entity)
    if clean_start == -1:
        return None

    return map_char_positions_to_original(
        clean_start, len(clean_entity), original_document_text
    )

def map_char_positions_to_original(clean_start: int, entity_length: int, original_text: str):
    """Translate a match in the space-stripped text back to original offsets.

    Simplified version of position mapping for testing.

    Args:
        clean_start: Index of the match's first character, counted over the
            non-space characters of ``original_text`` only.
        entity_length: Number of non-space characters in the match.
        original_text: The text as it appears with its spaces.

    Returns:
        A ``(start_pos, end_pos, found_text)`` tuple where ``found_text`` is
        ``original_text[start_pos:end_pos]``. Spaces inside the match are part
        of the span; spaces that precede the match are not. If the text runs
        out early, the positions stop at ``len(original_text)``.
    """
    text_length = len(original_text)
    original_pos = 0
    clean_pos = 0

    # Step over the `clean_start` non-space characters that precede the match.
    while clean_pos < clean_start and original_pos < text_length:
        if original_text[original_pos] != ' ':
            clean_pos += 1
        original_pos += 1

    # The cursor now sits right after the preceding character. Skip any spaces
    # separating it from the match so they are not reported (and later
    # replaced) as part of the entity.
    if entity_length > 0:
        while original_pos < text_length and original_text[original_pos] == ' ':
            original_pos += 1

    start_pos = original_pos

    # Consume `entity_length` non-space characters, keeping interior spaces.
    chars_found = 0
    while chars_found < entity_length and original_pos < text_length:
        if original_text[original_pos] != ' ':
            chars_found += 1
        original_pos += 1

    end_pos = original_pos
    found_text = original_text[start_pos:end_pos]

    return start_pos, end_pos, found_text

def apply_entity_masking_with_alignment_fixed(original_document_text: str, entity_mapping: dict):
    """Replace every occurrence of each mapped entity with its masked form.

    Fixed implementation that handles multiple occurrences. Entities are
    located with ``find_entity_alignment``, so matching ignores spaces.

    Args:
        original_document_text: Text to mask.
        entity_mapping: Maps entity text to the text that replaces it.

    Returns:
        The document with all located occurrences replaced.
    """
    masked_document = original_document_text
    # Process longer entities first, so an entity that is a substring of a
    # longer one is only searched for after the longer one has been replaced.
    sorted_entities = sorted(entity_mapping, key=len, reverse=True)

    for entity_text in sorted_entities:
        if not entity_text.replace(" ", ""):
            # Nothing to match on; a zero-width match could never be advanced
            # past and would be "replaced" forever.
            continue

        masked_text = entity_mapping[entity_text]

        # Only text after the previous replacement is searched. Restarting
        # from the beginning would find the same spot again whenever the
        # replacement itself contains the entity (e.g. an identity mapping),
        # and the loop would never terminate.
        search_from = 0

        while True:
            alignment_result = find_entity_alignment(
                entity_text, masked_document[search_from:]
            )

            if alignment_result is None:
                # No more occurrences found for this entity, move to next entity
                print(f"No more occurrences of '{entity_text}' found in document")
                break

            relative_start, relative_end, _found_text = alignment_result
            # Offsets come back relative to the searched suffix.
            start_pos = search_from + relative_start
            end_pos = search_from + relative_end

            masked_document = (
                masked_document[:start_pos] +
                masked_text +
                masked_document[end_pos:]
            )

            print(f"Masked entity '{entity_text}' -> '{masked_text}' at positions {start_pos}-{end_pos}")

            # Resume right after the inserted replacement.
            search_from = start_pos + len(masked_text)

    return masked_document

def test_fix_verification():
    """Verify that every occurrence of each mapped entity gets masked.

    Runs four scenarios through ``apply_entity_masking_with_alignment_fixed``:
    a repeated entity, several entities, entities written with and without an
    inner space, and a longer multi-line document. Each scenario prints its
    outcome and then asserts it, so a regression fails the run instead of
    only printing a message.
    """

    print("Testing Fix for Multiple Occurrence Issue")
    print("=" * 60)

    # Test case 1: Multiple occurrences of the same entity
    print("\nTest Case 1: Multiple occurrences of same entity")
    test_document_1 = "上诉人李淼因合同纠纷，法定代表人李淼，委托代理人李淼。"
    entity_mapping_1 = {"李淼": "李M"}

    print(f"Original: {test_document_1}")
    result_1 = apply_entity_masking_with_alignment_fixed(test_document_1, entity_mapping_1)
    print(f"Result: {result_1}")

    remaining_1 = result_1.count("李淼")
    expected_1 = "上诉人李M因合同纠纷，法定代表人李M，委托代理人李M。"

    case_1_ok = result_1 == expected_1 and remaining_1 == 0
    if case_1_ok:
        print("✅ PASS: All occurrences masked correctly")
    else:
        print(f"❌ FAIL: Expected '{expected_1}', got '{result_1}'")
        print(f"   Remaining '李淼' occurrences: {remaining_1}")
    assert case_1_ok, f"Expected '{expected_1}', got '{result_1}'"

    # Test case 2: Multiple entities with multiple occurrences
    print("\nTest Case 2: Multiple entities with multiple occurrences")
    test_document_2 = "上诉人李淼因合同纠纷，法定代表人李淼。北京丰复久信营销科技有限公司，丰复久信公司。"
    entity_mapping_2 = {
        "李淼": "李M",
        "北京丰复久信营销科技有限公司": "北京JO营销科技有限公司",
        "丰复久信公司": "丰复久信公司"
    }

    print(f"Original: {test_document_2}")
    result_2 = apply_entity_masking_with_alignment_fixed(test_document_2, entity_mapping_2)
    print(f"Result: {result_2}")

    remaining_2_li = result_2.count("李淼")
    remaining_2_company = result_2.count("北京丰复久信营销科技有限公司")

    case_2_ok = remaining_2_li == 0 and remaining_2_company == 0
    if case_2_ok:
        print("✅ PASS: All entities masked correctly")
    else:
        print(f"❌ FAIL: Remaining '李淼': {remaining_2_li}, '北京丰复久信营销科技有限公司': {remaining_2_company}")
    assert case_2_ok, (
        f"Remaining '李淼': {remaining_2_li}, "
        f"'北京丰复久信营销科技有限公司': {remaining_2_company}"
    )

    # Test case 3: Mixed spacing scenarios
    print("\nTest Case 3: Mixed spacing scenarios")
    test_document_3 = "上诉人李 淼因合同纠纷，法定代表人李淼，委托代理人李 淼。"
    entity_mapping_3 = {"李 淼": "李M", "李淼": "李M"}

    print(f"Original: {test_document_3}")
    result_3 = apply_entity_masking_with_alignment_fixed(test_document_3, entity_mapping_3)
    print(f"Result: {result_3}")

    remaining_3 = result_3.count("李淼") + result_3.count("李 淼")

    case_3_ok = remaining_3 == 0
    if case_3_ok:
        print("✅ PASS: Mixed spacing handled correctly")
    else:
        print(f"❌ FAIL: Remaining occurrences: {remaining_3}")
    assert case_3_ok, f"Remaining occurrences: {remaining_3}"

    # Test case 4: Complex document with real examples
    print("\nTest Case 4: Complex document with real examples")
    test_document_4 = """上诉人（原审原告）：北京丰复久信营销科技有限公司，住所地北京市海淀区北小马厂6号1号楼华天大厦1306室。
法定代表人：郭东军，执行董事、经理。
委托诉讼代理人：周大海，北京市康达律师事务所律师。
委托诉讼代理人：王乃哲，北京市康达律师事务所律师。
被上诉人（原审被告）：中研智创区块链技术有限公司，住所地天津市津南区双港镇工业园区优谷产业园5号楼-1505。
法定代表人：王欢子，总经理。
委托诉讼代理人：魏鑫，北京市昊衡律师事务所律师。"""

    entity_mapping_4 = {
        "北京丰复久信营销科技有限公司": "北京JO营销科技有限公司",
        "郭东军": "郭DJ",
        "周大海": "周DH",
        "王乃哲": "王NZ",
        "中研智创区块链技术有限公司": "中研智创区块链技术有限公司",
        "王欢子": "王HZ",
        "魏鑫": "魏X",
        "北京市康达律师事务所": "北京市KD律师事务所",
        "北京市昊衡律师事务所": "北京市HH律师事务所"
    }

    print(f"Original length: {len(test_document_4)} characters")
    result_4 = apply_entity_masking_with_alignment_fixed(test_document_4, entity_mapping_4)
    print(f"Result length: {len(result_4)} characters")

    # Check that all entities were masked. An entity whose replacement
    # contains the entity itself (e.g. one mapped to itself) legitimately
    # still appears in the output, so it is not counted as unmasked.
    unmasked_entities = [
        entity
        for entity, masked in entity_mapping_4.items()
        if entity not in masked and entity in result_4
    ]

    if not unmasked_entities:
        print("✅ PASS: All entities masked in complex document")
    else:
        print(f"❌ FAIL: Unmasked entities: {unmasked_entities}")
    assert not unmasked_entities, f"Unmasked entities: {unmasked_entities}"

    print("\n" + "=" * 60)
    print("Fix Verification Completed!")

# Allow running this file directly as a script, without a test runner.
if __name__ == "__main__":
    test_fix_verification()