refine: 新身份证、社会安全代码脱敏规则
This commit is contained in:
parent
2c985bc963
commit
1dd2f3884c
|
|
@ -543,8 +543,8 @@ class NerProcessor:
|
|||
5. 英文公司名:替换为所属行业名称,英文大写(如无行业信息,默认 COMPANY);
|
||||
6. 项目名:项目名称变为小写英文字母(如 a项目、b项目...);
|
||||
7. 案号:只替换案号中的数字部分为***,保留前后结构和"号"字,支持中间有空格;
|
||||
8. 身份证号:6位X;
|
||||
9. 社会信用代码:8位X;
|
||||
8. 身份证号:保留首6位,其他位数变为"X"(如:310103198802080000→310103XXXXXXXXXXXX);
|
||||
9. 社会信用代码:保留首7位,其他位数变为"X"(如:9133021276453538XT→9133021XXXXXXXXXXX);
|
||||
10. 地址:保留区级以上地址,路名以大写首字母替代,门牌数字以****代替,大厦名、小区名以大写首字母替代(如:上海市静安区恒丰路66号白云大厦1607室→上海市静安区HF路**号BY大厦****室);
|
||||
11. 其他类型按原有逻辑。
|
||||
"""
|
||||
|
|
@ -605,11 +605,19 @@ class NerProcessor:
|
|||
entity_mapping[text] = masked
|
||||
used_masked_names.add(masked)
|
||||
elif '身份证号' in entity_type:
|
||||
masked = 'X' * 6
|
||||
# 保留首6位,其他位数变为"X"
|
||||
if len(text) >= 6:
|
||||
masked = text[:6] + 'X' * (len(text) - 6)
|
||||
else:
|
||||
masked = text # fallback for invalid length
|
||||
entity_mapping[text] = masked
|
||||
used_masked_names.add(masked)
|
||||
elif '社会信用代码' in entity_type:
|
||||
masked = 'X' * 8
|
||||
# 保留首7位,其他位数变为"X"
|
||||
if len(text) >= 7:
|
||||
masked = text[:7] + 'X' * (len(text) - 7)
|
||||
else:
|
||||
masked = text # fallback for invalid length
|
||||
entity_mapping[text] = masked
|
||||
used_masked_names.add(masked)
|
||||
elif '地址' in entity_type:
|
||||
|
|
|
|||
|
|
@ -1 +0,0 @@
|
|||
关于张三天和北京易见天树有限公司的劳动纠纷
|
||||
|
|
@ -1,70 +0,0 @@
|
|||
import pytest
|
||||
import logging
|
||||
import sys
|
||||
import os
|
||||
|
||||
# Add the backend directory to the Python path for imports
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@pytest.fixture
def sql_step():
    """Fixture that yields a placeholder value: an empty string."""
    placeholder = ""
    # Trivial sanity check kept from the original placeholder fixture.
    assert 1 == 1
    return placeholder
|
||||
|
||||
|
||||
|
||||
def test_sql_insert_step_execute():
    """Placeholder for a database-backed integration test.

    No database is touched yet; the body only performs a trivial assertion,
    so the test always passes.
    """
    # NOTE: a real implementation would need a running database instance and
    # should skip itself when none is available.
    always_true = 1 == 1
    assert always_true
|
||||
|
||||
|
||||
def test_simple_assertion():
    """Smoke test: confirm that pytest collects and evaluates assertions."""
    checks = (
        (1, 1),
        (2 + 2, 4),
        ("hello", "hello"),
    )
    for actual, expected in checks:
        assert actual == expected
|
||||
|
||||
|
||||
def test_string_operations():
    """Check length, upper-casing and splitting of a short string."""
    sample = "hello world"
    words = sample.split()

    assert len(sample) == 11
    assert sample.upper() == "HELLO WORLD"
    assert words[0] == "hello"
|
||||
|
||||
|
||||
def test_basic_math():
    """Check addition, multiplication, true division and exponentiation."""
    cases = (
        (1 + 1, 2),
        (5 * 5, 25),
        (10 / 2, 5),
        (2 ** 3, 8),
    )
    for computed, expected in cases:
        assert computed == expected
|
||||
|
||||
|
||||
def test_list_operations():
    """Check length, first/last element access and summation of a list."""
    numbers = [1, 2, 3, 4, 5]
    first, *_, last = numbers

    assert len(numbers) == 5
    assert first == 1
    assert last == 5
    assert sum(numbers) == 15
|
||||
|
||||
|
||||
def test_with_fixture(sample_data):
    """Check the contents of the ``sample_data`` mapping.

    ``sample_data`` is presumably supplied by a pytest fixture defined
    outside this file (e.g. in ``conftest.py``) - TODO confirm.
    """
    assert sample_data["name"] == "test"
    assert sample_data["value"] == 42

    items = sample_data["items"]
    assert len(items) == 3
    assert items[0] == 1
|
||||
|
|
@ -0,0 +1,169 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test file for ID and social credit code masking functionality
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import sys
|
||||
import os
|
||||
|
||||
# Add the backend directory to the Python path for imports
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from app.core.document_handlers.ner_processor import NerProcessor
|
||||
|
||||
|
||||
def test_id_number_masking():
    """Verify that ID numbers keep their first 6 characters and mask the rest.

    Each case is passed to ``NerProcessor._generate_masked_mapping`` as a
    single ``身份证号`` entity with no linkage groups. Inputs shorter than six
    characters are expected to come back unchanged.
    """
    processor = NerProcessor()

    test_cases = [
        ("310103198802080000", "310103" + "X" * 12),
        ("110101199001011234", "110101" + "X" * 12),
        ("440301199505151234", "440301" + "X" * 12),
        ("320102198712345678", "320102" + "X" * 12),
        ("12345", "12345"),  # Edge case: too short, left untouched
    ]

    for original_id, expected_masked in test_cases:
        unique_entities = [{'text': original_id, 'type': '身份证号'}]
        linkage = {'entity_groups': []}

        mapping = processor._generate_masked_mapping(unique_entities, linkage)
        # Fall back to the original text when no mapping entry was produced.
        masked = mapping.get(original_id, original_id)

        # Kept for readable output when the file is run as a script.
        print(f"ID: {original_id} -> {masked}")

        # The original only printed a "Match" flag, so it could never fail.
        assert masked == expected_masked, (
            f"ID {original_id!r}: expected {expected_masked!r}, got {masked!r}"
        )
|
||||
|
||||
|
||||
def test_social_credit_code_masking():
    """Verify that social credit codes keep their first 7 characters.

    Each case is passed to ``NerProcessor._generate_masked_mapping`` as a
    single ``社会信用代码`` entity with no linkage groups. Inputs shorter than
    seven characters are expected to come back unchanged.
    """
    processor = NerProcessor()

    test_cases = [
        # The first case previously expected only 6 preserved characters,
        # which contradicts the keep-first-7 rule used by the other cases.
        ("9133021276453538XT", "9133021" + "X" * 11),
        ("91110000100000000X", "9111000" + "X" * 11),
        ("914403001922038216", "9144030" + "X" * 11),
        ("91310000132209458G", "9131000" + "X" * 11),
        ("123456", "123456"),  # Edge case: too short, left untouched
    ]

    for original_code, expected_masked in test_cases:
        unique_entities = [{'text': original_code, 'type': '社会信用代码'}]
        linkage = {'entity_groups': []}

        mapping = processor._generate_masked_mapping(unique_entities, linkage)
        # Fall back to the original text when no mapping entry was produced.
        masked = mapping.get(original_code, original_code)

        # Kept for readable output when the file is run as a script.
        print(f"Code: {original_code} -> {masked}")

        # The original only printed a "Match" flag, so it could never fail.
        assert masked == expected_masked, (
            f"Code {original_code!r}: expected {expected_masked!r}, "
            f"got {masked!r}"
        )
|
||||
|
||||
|
||||
def test_edge_cases():
    """Exercise boundary lengths for ID and social credit code masking.

    The two entity types preserve a different number of leading characters
    (6 for ``身份证号``, 7 for ``社会信用代码``), so every case carries its own
    expected value per type.
    """
    processor = NerProcessor()
    no_linkage = {'entity_groups': []}

    very_long = "123456789012345678901234567890"  # 30 characters

    # (original, expected as ID number, expected as social credit code)
    edge_cases = [
        ("", "", ""),                        # Empty string
        ("123", "123", "123"),               # Too short for both rules
        ("123456", "123456", "123456"),      # Exactly 6: nothing left to mask
        ("1234567", "123456X", "1234567"),   # Exactly 7: only the ID rule masks
        (very_long, "123456" + "X" * 24, "1234567" + "X" * 23),
    ]

    for original, expected_id, expected_code in edge_cases:
        # Same text, masked as an ID number.
        entity_id = {'text': original, 'type': '身份证号'}
        mapping_id = processor._generate_masked_mapping([entity_id], no_linkage)
        masked_id = mapping_id.get(original, original)

        # Same text, masked as a social credit code.
        entity_code = {'text': original, 'type': '社会信用代码'}
        mapping_code = processor._generate_masked_mapping(
            [entity_code], no_linkage
        )
        masked_code = mapping_code.get(original, original)

        # Kept for readable output when the file is run as a script.
        print(f"Original: {original!r} | ID: {masked_id!r} | Code: {masked_code!r}")

        assert masked_id == expected_id, (
            f"ID {original!r}: expected {expected_id!r}, got {masked_id!r}"
        )
        assert masked_code == expected_code, (
            f"Code {original!r}: expected {expected_code!r}, got {masked_code!r}"
        )
|
||||
|
||||
|
||||
def test_mixed_entities():
    """Mask a batch that mixes ID, social credit code, person and company.

    Only the ID number and social credit code results are asserted, because
    their masking depends solely on the input text. The person and company
    results are printed for inspection but not asserted here.
    """
    processor = NerProcessor()

    entities = [
        {'text': '310103198802080000', 'type': '身份证号'},
        {'text': '9133021276453538XT', 'type': '社会信用代码'},
        {'text': '李强', 'type': '人名'},
        {'text': '上海盒马网络科技有限公司', 'type': '公司名称'},
    ]
    linkage = {'entity_groups': []}

    mapping = processor._generate_masked_mapping(entities, linkage)

    print("Mixed Entities Test:")
    print("=" * 30)
    for entity in entities:
        original = entity['text']
        # Fall back to the original text when no mapping entry was produced.
        masked = mapping.get(original, original)
        print(f"{entity['type']}: {original} -> {masked}")

    # Other entity types in the batch must not disturb these two rules.
    assert mapping.get('310103198802080000') == '310103' + 'X' * 12
    assert mapping.get('9133021276453538XT') == '9133021' + 'X' * 11
|
||||
|
||||
def test_id_masking():
    """Assert the masking shape for one ID number and one social credit code.

    ID numbers keep their first 6 characters and social credit codes keep
    their first 7; every remaining character becomes ``X`` and the overall
    length is preserved.
    """
    processor = NerProcessor()
    no_linkage = {'entity_groups': []}

    # ID number masking
    id_entity = {'text': '310103198802080000', 'type': '身份证号'}
    id_mapping = processor._generate_masked_mapping([id_entity], no_linkage)
    masked_id = id_mapping.get('310103198802080000', '')

    # Social credit code masking
    code_entity = {'text': '9133021276453538XT', 'type': '社会信用代码'}
    code_mapping = processor._generate_masked_mapping([code_entity], no_linkage)
    masked_code = code_mapping.get('9133021276453538XT', '')

    assert masked_id.startswith('310103')   # First 6 characters preserved
    assert masked_id[6:] == 'X' * 12        # Everything after them masked
    assert len(masked_id) == 18             # Total length preserved

    # The original checked a 6-character prefix and a 12-X suffix here,
    # which cannot hold when 7 characters are preserved.
    assert masked_code.startswith('9133021')  # First 7 characters preserved
    assert masked_code[7:] == 'X' * 11        # Everything after them masked
    assert len(masked_code) == 18             # Total length preserved

    print(f"ID masking: 310103198802080000 -> {masked_id}")
    print(f"Code masking: 9133021276453538XT -> {masked_code}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Allows running this file directly as a script, without pytest.
    print("Testing ID and Social Credit Code Masking")
    print("=" * 50)

    test_id_number_masking()
    print()
    test_social_credit_code_masking()
    print()
    test_edge_cases()
    print()
    test_mixed_entities()
    print()
    # Previously omitted from the script runner.
    test_id_masking()
|
||||
Loading…
Reference in New Issue