legal-doc-masker/backend/tests/test_id_masking.py

170 lines
6.0 KiB
Python

#!/usr/bin/env python3
"""
Test file for ID and social credit code masking functionality
"""
import pytest
import sys
import os
# Add the backend directory to the Python path for imports
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from app.core.document_handlers.ner_processor import NerProcessor
def test_id_number_masking():
    """Print masking results for entities typed as ID numbers ('身份证号').

    Each sample is sent on its own through
    ``NerProcessor._generate_masked_mapping`` with an empty entity-group
    linkage. The masked value, the expected value and whether they match
    are printed; nothing is asserted, so a mismatch does not fail the test.

    NOTE(review): consider replacing the printed "Match" line with a real
    assertion once the expected values are confirmed.
    """
    ner = NerProcessor()

    # (raw ID number, expected masked form)
    samples = (
        ("310103198802080000", "310103XXXXXXXXXXXX"),
        ("110101199001011234", "110101XXXXXXXXXXXX"),
        ("440301199505151234", "440301XXXXXXXXXXXX"),
        ("320102198712345678", "320102XXXXXXXXXXXX"),
        ("12345", "12345"),  # too short: expected to come back unchanged
    )

    for raw_id, want in samples:
        # One-entity run through the full mapping pipeline, no linkage groups.
        lookup = ner._generate_masked_mapping(
            [{'text': raw_id, 'type': '身份证号'}],
            {'entity_groups': []},
        )
        # Fall back to the raw text when the mapping has no entry for it.
        got = lookup.get(raw_id, raw_id)
        print(
            f"Original ID: {raw_id}",
            f"Masked ID: {got}",
            f"Expected: {want}",
            f"Match: {got == want}",
            "-" * 50,
            sep="\n",
        )
def test_social_credit_code_masking():
    """Print masking results for social credit code ('社会信用代码') entities.

    Each sample is sent on its own through
    ``NerProcessor._generate_masked_mapping`` with an empty entity-group
    linkage. The masked value, the expected value and whether they match
    are printed; nothing is asserted, so a mismatch does not fail the test.

    NOTE(review): the expected values are not uniform -- the first sample
    keeps 6 leading characters while the next three keep 7. Confirm which
    rule is intended before turning these into assertions.
    """
    ner = NerProcessor()

    # (raw code, expected masked form)
    samples = (
        ("9133021276453538XT", "913302XXXXXXXXXXXX"),
        ("91110000100000000X", "9111000XXXXXXXXXXX"),
        ("914403001922038216", "9144030XXXXXXXXXXX"),
        ("91310000132209458G", "9131000XXXXXXXXXXX"),
        ("123456", "123456"),  # too short: expected to come back unchanged
    )

    for raw_code, want in samples:
        # One-entity run through the full mapping pipeline, no linkage groups.
        lookup = ner._generate_masked_mapping(
            [{'text': raw_code, 'type': '社会信用代码'}],
            {'entity_groups': []},
        )
        # Fall back to the raw text when the mapping has no entry for it.
        got = lookup.get(raw_code, raw_code)
        print(
            f"Original Code: {raw_code}",
            f"Masked Code: {got}",
            f"Expected: {want}",
            f"Match: {got == want}",
            "-" * 50,
            sep="\n",
        )
def test_edge_cases():
    """Print how boundary inputs are masked as an ID number and as a code.

    Every input is masked twice with the same processor: first typed as
    '身份证号', then typed as '社会信用代码'. Only the original and the two
    masked values are printed; nothing is asserted.

    NOTE(review): the second element of each pair (the "expected" value)
    is never compared against anything. The expected string for the
    30-character input also looks shorter than the input -- confirm the
    intended values before adding assertions.
    """
    ner = NerProcessor()

    def mask_as(entity_type, text):
        # Single-entity mapping with no linkage; raw text is the fallback.
        lookup = ner._generate_masked_mapping(
            [{'text': text, 'type': entity_type}],
            {'entity_groups': []},
        )
        return lookup.get(text, text)

    boundary_inputs = (
        ("", ""),  # empty string
        ("123", "123"),  # too short for an ID number
        ("123456", "123456"),  # too short for a social credit code
        ("123456789012345678901234567890", "123456XXXXXXXXXXXXXXXXXX"),  # very long ID
    )

    for raw, _unused_expected in boundary_inputs:
        # Order matters if the processor keeps state: ID first, then code.
        as_id = mask_as('身份证号', raw)
        as_code = mask_as('社会信用代码', raw)
        print(
            f"Original: {raw}",
            f"ID Masked: {as_id}",
            f"Code Masked: {as_code}",
            "-" * 30,
            sep="\n",
        )
def test_mixed_entities():
    """Print the mapping produced for a batch of differently typed entities.

    An ID number, a social credit code, a person name and a company name
    are passed together to ``NerProcessor._generate_masked_mapping`` with
    an empty entity-group linkage. One line per entity is printed showing
    its type, original text and mapped text (the original text is shown
    when the mapping has no entry). Nothing is asserted.
    """
    ner = NerProcessor()

    # (text, entity type) pairs, expanded into the dict shape passed below.
    typed_texts = (
        ('310103198802080000', '身份证号'),
        ('9133021276453538XT', '社会信用代码'),
        ('李强', '人名'),
        ('上海盒马网络科技有限公司', '公司名称'),
    )
    entities = [{'text': text, 'type': kind} for text, kind in typed_texts]

    # All entities go through the pipeline in a single call.
    mapping = ner._generate_masked_mapping(entities, {'entity_groups': []})

    print("Mixed Entities Test:")
    print("=" * 30)
    for item in entities:
        text = item['text']
        print(f"{item['type']}: {text} -> {mapping.get(text, text)}")
def test_id_masking():
    """Assert the masked shape of one ID number and one social credit code.

    Both values are masked individually through
    ``NerProcessor._generate_masked_mapping`` with an empty entity-group
    linkage. A missing mapping entry yields an empty string, which makes
    the assertions below fail.

    For each value the test checks that the first six characters are kept,
    that the result ends with twelve ``X`` characters, and that the length
    is still 18.
    """
    from app.core.document_handlers.ner_processor import NerProcessor

    id_text = '310103198802080000'
    code_text = '9133021276453538XT'
    no_linkage_key = 'entity_groups'

    ner = NerProcessor()

    # ID number first, then the social credit code (same order as before).
    masked_id = ner._generate_masked_mapping(
        [{'text': id_text, 'type': '身份证号'}],
        {no_linkage_key: []},
    ).get(id_text, '')
    masked_code = ner._generate_masked_mapping(
        [{'text': code_text, 'type': '社会信用代码'}],
        {no_linkage_key: []},
    ).get(code_text, '')

    # ID number: leading six characters kept, remainder replaced by X.
    assert masked_id.startswith('310103')
    assert masked_id.endswith('XXXXXXXXXXXX')
    assert len(masked_id) == 18

    # Social credit code: the prefix asserted here is six characters long.
    assert masked_code.startswith('913302')
    assert masked_code.endswith('XXXXXXXXXXXX')
    assert len(masked_code) == 18

    print(f"ID masking: 310103198802080000 -> {masked_id}")
    print(f"Code masking: 9133021276453538XT -> {masked_code}")
if __name__ == "__main__":
    # Script mode: run the print-based demonstrations in a fixed order,
    # separated by a blank line.
    print("Testing ID and Social Credit Code Masking")
    print("=" * 50)
    demo_tests = (
        test_id_number_masking,
        test_social_credit_code_masking,
        test_edge_cases,
        test_mixed_entities,
    )
    for position, run_demo in enumerate(demo_tests):
        if position:
            # Blank separator before every demo except the first.
            print()
        run_demo()