refine: 新身份证、社会安全代码脱敏规则

This commit is contained in:
tigermren 2025-08-17 15:59:12 +08:00
parent 2c985bc963
commit 1dd2f3884c
4 changed files with 181 additions and 75 deletions

View File

@ -543,8 +543,8 @@ class NerProcessor:
5. 英文公司名替换为所属行业名称英文大写如无行业信息默认 COMPANY
6. 项目名项目名称变为小写英文字母 a项目b项目...
7. 案号只替换案号中的数字部分为***保留前后结构和""支持中间有空格
8. 身份证号6位X
9. 社会信用代码8位X
8. 身份证号：保留首6位，其他位数变为"X"，如：310103198802080000 → 310103XXXXXXXXXXXX
9. 社会信用代码：保留首7位，其他位数变为"X"，如：9133021276453538XT → 9133021XXXXXXXXXXX
10. 地址保留区级以上地址路名以大写首字母替代门牌数字以****代替大厦名小区名以大写首字母替代上海市静安区恒丰路66号白云大厦1607室上海市静安区HF路**号BY大厦****
11. 其他类型按原有逻辑
"""
@ -605,11 +605,19 @@ class NerProcessor:
entity_mapping[text] = masked
used_masked_names.add(masked)
elif '身份证号' in entity_type:
masked = 'X' * 6
# 保留首6位其他位数变为"X"
if len(text) >= 6:
masked = text[:6] + 'X' * (len(text) - 6)
else:
masked = text # fallback for invalid length
entity_mapping[text] = masked
used_masked_names.add(masked)
elif '社会信用代码' in entity_type:
masked = 'X' * 8
# 保留首7位其他位数变为"X"
if len(text) >= 7:
masked = text[:7] + 'X' * (len(text) - 7)
else:
masked = text # fallback for invalid length
entity_mapping[text] = masked
used_masked_names.add(masked)
elif '地址' in entity_type:

View File

@ -1 +0,0 @@
关于张三天和北京易见天树有限公司的劳动纠纷

View File

@ -1,70 +0,0 @@
import pytest
import logging
import sys
import os
# Prepend the parent of this file's directory (the backend directory) to
# sys.path so that imports resolve from there when the tests are run directly.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Configure root logging at INFO level and create a logger named after this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
@pytest.fixture
def sql_step():
    """Placeholder fixture: runs a trivial sanity assertion and provides an empty string."""
    placeholder_step = ""
    assert 1 == 1
    return placeholder_step
def test_sql_insert_step_execute():
    """Placeholder for an integration test against a real database.

    The body currently performs only a trivial assertion; no database
    connection is opened and no inputs are set.
    """
    # NOTE: a real implementation would need a running database instance and
    # should skip itself (pytest.skip) when none is available.
    assert 1 == 1
def test_simple_assertion():
    """Smoke test confirming that pytest collects and evaluates assertions."""
    trivial_pairs = (
        (1, 1),
        (2 + 2, 4),
        ("hello", "hello"),
    )
    for actual, expected in trivial_pairs:
        assert actual == expected
def test_string_operations():
    """Check length, upper-casing and whitespace splitting of a sample string."""
    sample = "hello world"
    assert len(sample) == 11
    assert sample.upper() == "HELLO WORLD"
    first_word, *_ = sample.split()
    assert first_word == "hello"
def test_basic_math():
    """Check addition, multiplication, true division and exponentiation."""
    arithmetic_checks = (
        (1 + 1, 2),
        (5 * 5, 25),
        (10 / 2, 5),
        (2 ** 3, 8),
    )
    for computed, expected in arithmetic_checks:
        assert computed == expected
def test_list_operations():
    """Check length, first/last element access and summation of a small list."""
    numbers = [1, 2, 3, 4, 5]
    assert len(numbers) == 5
    head, *_, tail = numbers
    assert head == 1
    assert tail == 5
    assert sum(numbers) == 15
def test_with_fixture(sample_data):
    """Check the contents of the ``sample_data`` fixture.

    The fixture itself is not defined in this file; the assertions below
    require it to be a mapping with ``name``, ``value`` and ``items`` keys.
    """
    assert sample_data["name"] == "test"
    assert sample_data["value"] == 42
    items = sample_data["items"]
    assert len(items) == 3
    assert items[0] == 1

View File

@ -0,0 +1,169 @@
#!/usr/bin/env python3
"""
Test file for ID and social credit code masking functionality
"""
import pytest
import sys
import os
# Add the backend directory to the Python path for imports
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from app.core.document_handlers.ner_processor import NerProcessor
def test_id_number_masking():
    """Verify that ID numbers keep their first 6 characters and mask the rest with "X".

    Each case is passed through ``NerProcessor._generate_masked_mapping`` as a
    single ``身份证号`` entity with no linkage groups.  Inputs shorter than
    6 characters are expected to come back unchanged.

    Raises:
        AssertionError: if any masked value differs from its expected value.
    """
    processor = NerProcessor()
    # (original, expected) pairs; every expected value has the input's length.
    test_cases = [
        ("310103198802080000", "310103XXXXXXXXXXXX"),
        ("110101199001011234", "110101XXXXXXXXXXXX"),
        ("440301199505151234", "440301XXXXXXXXXXXX"),
        ("320102198712345678", "320102XXXXXXXXXXXX"),
        ("12345", "12345"),  # Edge case: too short
    ]
    for original_id, expected_masked in test_cases:
        # Build a minimal entity and run it through the full mapping pipeline.
        entity = {'text': original_id, 'type': '身份证号'}
        unique_entities = [entity]
        linkage = {'entity_groups': []}
        mapping = processor._generate_masked_mapping(unique_entities, linkage)
        # Fall back to the original text when no mapping entry was produced.
        masked = mapping.get(original_id, original_id)
        print(f"Original ID: {original_id}")
        print(f"Masked ID: {masked}")
        print(f"Expected: {expected_masked}")
        print(f"Match: {masked == expected_masked}")
        print("-" * 50)
        # Previously the result was only printed, so the test could never fail.
        assert masked == expected_masked, (
            f"ID {original_id!r} masked as {masked!r}, expected {expected_masked!r}"
        )
def test_social_credit_code_masking():
    """Verify that social credit codes keep their first 7 characters and mask the rest with "X".

    Each case is passed through ``NerProcessor._generate_masked_mapping`` as a
    single ``社会信用代码`` entity with no linkage groups.  Inputs shorter
    than 7 characters are expected to come back unchanged.

    Raises:
        AssertionError: if any masked value differs from its expected value.
    """
    processor = NerProcessor()
    # (original, expected) pairs; every expected value keeps exactly 7 leading
    # characters, matching the masking rule under test.
    test_cases = [
        ("9133021276453538XT", "9133021XXXXXXXXXXX"),
        ("91110000100000000X", "9111000XXXXXXXXXXX"),
        ("914403001922038216", "9144030XXXXXXXXXXX"),
        ("91310000132209458G", "9131000XXXXXXXXXXX"),
        ("123456", "123456"),  # Edge case: too short
    ]
    for original_code, expected_masked in test_cases:
        # Build a minimal entity and run it through the full mapping pipeline.
        entity = {'text': original_code, 'type': '社会信用代码'}
        unique_entities = [entity]
        linkage = {'entity_groups': []}
        mapping = processor._generate_masked_mapping(unique_entities, linkage)
        # Fall back to the original text when no mapping entry was produced.
        masked = mapping.get(original_code, original_code)
        print(f"Original Code: {original_code}")
        print(f"Masked Code: {masked}")
        print(f"Expected: {expected_masked}")
        print(f"Match: {masked == expected_masked}")
        print("-" * 50)
        # Previously the result was only printed, so the test could never fail.
        assert masked == expected_masked, (
            f"code {original_code!r} masked as {masked!r}, expected {expected_masked!r}"
        )
def test_edge_cases():
    """Verify masking of empty, too-short and over-long inputs for both entity types.

    Every input is masked twice through
    ``NerProcessor._generate_masked_mapping``: once as a ``身份证号`` entity
    (first 6 characters kept) and once as a ``社会信用代码`` entity (first 7
    characters kept).  Inputs shorter than the kept prefix are expected to
    come back unchanged.

    Raises:
        AssertionError: if any masked value differs from its expected value.
    """
    processor = NerProcessor()
    long_input = "123456789012345678901234567890"
    # (original, expected as ID number, expected as social credit code)
    edge_cases = [
        ("", "", ""),  # Empty string
        ("123", "123", "123"),  # Too short for both types
        ("123456", "123456", "123456"),  # Exactly 6: nothing left to mask as ID, too short for code
        (
            long_input,
            long_input[:6] + "X" * (len(long_input) - 6),
            long_input[:7] + "X" * (len(long_input) - 7),
        ),  # Very long input: length must be preserved
    ]
    for original, expected_id, expected_code in edge_cases:
        # Test ID number
        entity_id = {'text': original, 'type': '身份证号'}
        mapping_id = processor._generate_masked_mapping([entity_id], {'entity_groups': []})
        masked_id = mapping_id.get(original, original)
        # Test social credit code
        entity_code = {'text': original, 'type': '社会信用代码'}
        mapping_code = processor._generate_masked_mapping([entity_code], {'entity_groups': []})
        masked_code = mapping_code.get(original, original)
        print(f"Original: {original}")
        print(f"ID Masked: {masked_id}")
        print(f"Code Masked: {masked_code}")
        print("-" * 30)
        # Previously the expected column was never compared against the result.
        assert masked_id == expected_id, (
            f"ID {original!r} masked as {masked_id!r}, expected {expected_id!r}"
        )
        assert masked_code == expected_code, (
            f"code {original!r} masked as {masked_code!r}, expected {expected_code!r}"
        )
def test_mixed_entities():
    """Verify ID and social-credit-code masking when mixed with other entity types.

    All four entities are mapped in a single
    ``NerProcessor._generate_masked_mapping`` call.  Only the ID number and the
    social credit code are asserted, because their masked form is a fixed
    prefix-plus-"X" pattern; the person and company results are printed only.

    Raises:
        AssertionError: if the ID number or social credit code is masked incorrectly.
    """
    processor = NerProcessor()
    # Create mixed entities
    entities = [
        {'text': '310103198802080000', 'type': '身份证号'},
        {'text': '9133021276453538XT', 'type': '社会信用代码'},
        {'text': '李强', 'type': '人名'},
        {'text': '上海盒马网络科技有限公司', 'type': '公司名称'},
    ]
    # Expected masks for the entity types with a deterministic rule:
    # ID numbers keep 6 leading characters, social credit codes keep 7.
    expected_masks = {
        '310103198802080000': '310103XXXXXXXXXXXX',
        '9133021276453538XT': '9133021XXXXXXXXXXX',
    }
    linkage = {'entity_groups': []}
    # Test the masking through the full pipeline
    mapping = processor._generate_masked_mapping(entities, linkage)
    print("Mixed Entities Test:")
    print("=" * 30)
    for entity in entities:
        original = entity['text']
        entity_type = entity['type']
        masked = mapping.get(original, original)
        print(f"{entity_type}: {original} -> {masked}")
        if original in expected_masks:
            assert masked == expected_masks[original], (
                f"{entity_type} {original!r} masked as {masked!r}, "
                f"expected {expected_masks[original]!r}"
            )
def test_id_masking():
    """Verify prefix, masked suffix and length for one ID number and one social credit code.

    The ID number must keep its first 6 characters and the social credit code
    its first 7; all remaining characters must be "X" and the total length of
    18 must be preserved.

    Raises:
        AssertionError: if either masked value violates its rule.
    """
    processor = NerProcessor()
    # Test ID number masking
    id_entity = {'text': '310103198802080000', 'type': '身份证号'}
    id_mapping = processor._generate_masked_mapping([id_entity], {'entity_groups': []})
    masked_id = id_mapping.get('310103198802080000', '')
    # Test social credit code masking
    code_entity = {'text': '9133021276453538XT', 'type': '社会信用代码'}
    code_mapping = processor._generate_masked_mapping([code_entity], {'entity_groups': []})
    masked_code = code_mapping.get('9133021276453538XT', '')
    # Verify the masking rules
    assert masked_id.startswith('310103')  # First 6 characters preserved
    assert masked_id.endswith('X' * 12)  # Remaining 12 characters masked with X
    assert len(masked_id) == 18  # Total length preserved
    assert masked_code.startswith('9133021')  # First 7 characters preserved
    # 18 - 7 = 11 masked characters; requiring 12 X's would contradict the
    # 7-character prefix rule.
    assert masked_code.endswith('X' * 11)
    assert len(masked_code) == 18  # Total length preserved
    print(f"ID masking: 310103198802080000 -> {masked_id}")
    print(f"Code masking: 9133021276453538XT -> {masked_code}")
if __name__ == "__main__":
    # Script mode: run the printing test functions in order, separated by a
    # blank line, under a banner.
    print("Testing ID and Social Credit Code Masking")
    print("=" * 50)
    suites = (
        test_id_number_masking,
        test_social_credit_code_masking,
        test_edge_cases,
        test_mixed_entities,
    )
    for position, run_suite in enumerate(suites):
        if position:
            print()
        run_suite()