refine: 新身份证、社会信用代码脱敏规则

2025-08-17 15:59:12 +08:00 · 2025-08-17 15:59:12 +08:00 · 1dd2f3884c
parent 2c985bc963
commit 1dd2f3884c
4 changed files with 181 additions and 75 deletions
--- a/backend/app/core/document_handlers/ner_processor.py
+++ b/backend/app/core/document_handlers/ner_processor.py
@ -543,8 +543,8 @@ class NerProcessor:
        5. 英文公司名：替换为所属行业名称，英文大写（如无行业信息，默认 COMPANY）；
        6. 项目名：项目名称变为小写英文字母（如 a项目、b项目...）；
        7. 案号：只替换案号中的数字部分为***，保留前后结构和"号"字，支持中间有空格；
-        8. 身份证号：6位X；
-        9. 社会信用代码：8位X；
+        8. 身份证号：保留首6位，其他位数变为"X"（如：310103198802080000→310103XXXXXXXXXXXX）；
+        9. 社会信用代码：保留首7位，其他位数变为"X"（如：9133021276453538XT→9133021XXXXXXXXXXX）；
        10. 地址：保留区级以上地址，路名以大写首字母替代，门牌数字以****代替，大厦名、小区名以大写首字母替代（如：上海市静安区恒丰路66号白云大厦1607室→上海市静安区HF路**号BY大厦****室）；
        11. 其他类型按原有逻辑。
        """
@ -605,11 +605,19 @@ class NerProcessor:
                entity_mapping[text] = masked
                used_masked_names.add(masked)
            elif '身份证号' in entity_type:
-                masked = 'X' * 6
+                # 保留首6位，其他位数变为"X"
+                if len(text) >= 6:
+                    masked = text[:6] + 'X' * (len(text) - 6)
+                else:
+                    masked = text  # fallback for invalid length
                entity_mapping[text] = masked
                used_masked_names.add(masked)
            elif '社会信用代码' in entity_type:
-                masked = 'X' * 8
+                # 保留首7位，其他位数变为"X"
+                if len(text) >= 7:
+                    masked = text[:7] + 'X' * (len(text) - 7)
+                else:
+                    masked = text  # fallback for invalid length
                entity_mapping[text] = masked
                used_masked_names.add(masked)
            elif '地址' in entity_type:
--- a/backend/tests/test.txt
+++ b/backend/tests/test.txt
@ -1 +0,0 @@
-关于张三天和北京易见天树有限公司的劳动纠纷
--- a/backend/tests/test1.py
+++ b/backend/tests/test1.py
@ -1,70 +0,0 @@
-import pytest
-import logging
-import sys
-import os
-
-# Add the backend directory to the Python path for imports
-sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-
-# Configure logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-
-@pytest.fixture
-def sql_step():
-    assert 1 == 1
-    return ""
-
-
-
-def test_sql_insert_step_execute():
-    """
-    Integration test with a real database connection.
-    Note: This test requires a running database instance
-    """
-    # Skip this test if no database is available
-    # pytest.skip("Skipping integration test - requires database setup")
-    
-    # Set inputs
-    assert 1 == 1
-
-
-def test_simple_assertion():
-    """Simple test to verify pytest is working"""
-    assert 1 == 1
-    assert 2 + 2 == 4
-    assert "hello" == "hello"
-
-
-def test_string_operations():
-    """Test string operations"""
-    text = "hello world"
-    assert len(text) == 11
-    assert text.upper() == "HELLO WORLD"
-    assert text.split()[0] == "hello"
-
-
-def test_basic_math():
-    """Test basic mathematical operations"""
-    assert 1 + 1 == 2
-    assert 5 * 5 == 25
-    assert 10 / 2 == 5
-    assert 2 ** 3 == 8
-
-
-def test_list_operations():
-    """Test list operations"""
-    my_list = [1, 2, 3, 4, 5]
-    assert len(my_list) == 5
-    assert my_list[0] == 1
-    assert my_list[-1] == 5
-    assert sum(my_list) == 15
-
-
-def test_with_fixture(sample_data):
-    """Test using a fixture"""
-    assert sample_data["name"] == "test"
-    assert sample_data["value"] == 42
-    assert len(sample_data["items"]) == 3
-    assert sample_data["items"][0] == 1
--- a/backend/tests/test_id_masking.py
+++ b/backend/tests/test_id_masking.py
@ -0,0 +1,169 @@
+#!/usr/bin/env python3
+"""
+Test file for ID and social credit code masking functionality
+"""
+
+import pytest
+import sys
+import os
+
+# Add the backend directory to the Python path for imports
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from app.core.document_handlers.ner_processor import NerProcessor
+
+
+def test_id_number_masking():
+    """Check that ID numbers keep their first 6 characters and mask the rest.
+
+    Each case is passed to ``NerProcessor._generate_masked_mapping`` as a
+    single ID-number entity with an empty linkage. Inputs shorter than 6
+    characters are expected to come back unchanged.
+    """
+    processor = NerProcessor()
+
+    # (original, expected) pairs based on the requirements
+    test_cases = [
+        ("310103198802080000", "310103XXXXXXXXXXXX"),
+        ("110101199001011234", "110101XXXXXXXXXXXX"),
+        ("440301199505151234", "440301XXXXXXXXXXXX"),
+        ("320102198712345678", "320102XXXXXXXXXXXX"),
+        ("12345", "12345"),  # Edge case: too short, returned as-is
+    ]
+
+    for original_id, expected_masked in test_cases:
+        # Build a single mock entity with no linkage groups
+        unique_entities = [{'text': original_id, 'type': '身份证号'}]
+        linkage = {'entity_groups': []}
+
+        mapping = processor._generate_masked_mapping(unique_entities, linkage)
+        masked = mapping.get(original_id, original_id)
+
+        # Keep the human-readable trace for script-mode runs
+        print(f"Original ID: {original_id}")
+        print(f"Masked ID:   {masked}")
+        print(f"Expected:    {expected_masked}")
+        print("-" * 50)
+
+        # Fail on a mismatch instead of only printing it
+        assert masked == expected_masked, (
+            f"ID {original_id!r}: expected {expected_masked!r}, got {masked!r}"
+        )
+
+
+def test_social_credit_code_masking():
+    """Check that social credit codes keep their first 7 characters.
+
+    Each case is passed to ``NerProcessor._generate_masked_mapping`` as a
+    single social-credit-code entity with an empty linkage. Inputs shorter
+    than 7 characters are expected to come back unchanged.
+    """
+    processor = NerProcessor()
+
+    # (original, expected) pairs: 7 characters kept, the remainder replaced by X
+    test_cases = [
+        ("9133021276453538XT", "9133021XXXXXXXXXXX"),
+        ("91110000100000000X", "9111000XXXXXXXXXXX"),
+        ("914403001922038216", "9144030XXXXXXXXXXX"),
+        ("91310000132209458G", "9131000XXXXXXXXXXX"),
+        ("123456", "123456"),  # Edge case: too short, returned as-is
+    ]
+
+    for original_code, expected_masked in test_cases:
+        # Build a single mock entity with no linkage groups
+        unique_entities = [{'text': original_code, 'type': '社会信用代码'}]
+        linkage = {'entity_groups': []}
+
+        mapping = processor._generate_masked_mapping(unique_entities, linkage)
+        masked = mapping.get(original_code, original_code)
+
+        # Keep the human-readable trace for script-mode runs
+        print(f"Original Code: {original_code}")
+        print(f"Masked Code:   {masked}")
+        print(f"Expected:      {expected_masked}")
+        print("-" * 50)
+
+        # Fail on a mismatch instead of only printing it
+        assert masked == expected_masked, (
+            f"Code {original_code!r}: expected {expected_masked!r}, got {masked!r}"
+        )
+
+
+def test_edge_cases():
+    """Check boundary-length inputs for both ID and social credit code masking.
+
+    The two entity types keep different prefix lengths (6 for ID numbers,
+    7 for social credit codes), so each case carries its own expected value
+    for each type.
+    """
+    processor = NerProcessor()
+
+    long_input = "123456789012345678901234567890"  # 30 characters
+
+    # (original, expected ID mask, expected credit-code mask)
+    edge_cases = [
+        ("", "", ""),  # Empty string
+        ("123", "123", "123"),  # Too short for either rule
+        ("123456", "123456", "123456"),  # ID keeps all 6; too short for code
+        (
+            long_input,
+            "123456" + "X" * (len(long_input) - 6),
+            "1234567" + "X" * (len(long_input) - 7),
+        ),  # Very long input: total length is preserved
+    ]
+
+    for original, expected_id, expected_code in edge_cases:
+        # Mask the same text once as an ID number...
+        entity_id = {'text': original, 'type': '身份证号'}
+        mapping_id = processor._generate_masked_mapping([entity_id], {'entity_groups': []})
+        masked_id = mapping_id.get(original, original)
+
+        # ...and once as a social credit code
+        entity_code = {'text': original, 'type': '社会信用代码'}
+        mapping_code = processor._generate_masked_mapping([entity_code], {'entity_groups': []})
+        masked_code = mapping_code.get(original, original)
+
+        print(f"Original: {original}")
+        print(f"ID Masked: {masked_id}")
+        print(f"Code Masked: {masked_code}")
+        print("-" * 30)
+
+        assert masked_id == expected_id, (
+            f"ID {original!r}: expected {expected_id!r}, got {masked_id!r}"
+        )
+        assert masked_code == expected_code, (
+            f"Code {original!r}: expected {expected_code!r}, got {masked_code!r}"
+        )
+
+
+def test_mixed_entities():
+    """Check ID and credit-code masking when other entity types are present.
+
+    Only the ID number and the social credit code are asserted. The masked
+    forms of the person and company names depend on logic that is not
+    visible from this test, so they are printed for inspection only.
+    """
+    processor = NerProcessor()
+
+    # One entity of each type in a single call
+    entities = [
+        {'text': '310103198802080000', 'type': '身份证号'},
+        {'text': '9133021276453538XT', 'type': '社会信用代码'},
+        {'text': '李强', 'type': '人名'},
+        {'text': '上海盒马网络科技有限公司', 'type': '公司名称'},
+    ]
+
+    linkage = {'entity_groups': []}
+
+    mapping = processor._generate_masked_mapping(entities, linkage)
+
+    print("Mixed Entities Test:")
+    print("=" * 30)
+    for entity in entities:
+        original = entity['text']
+        entity_type = entity['type']
+        masked = mapping.get(original, original)
+        print(f"{entity_type}: {original} -> {masked}")
+
+    # The presence of other entity types must not change these two results
+    assert mapping.get('310103198802080000') == '310103' + 'X' * 12
+    assert mapping.get('9133021276453538XT') == '9133021' + 'X' * 11
+
+def test_id_masking():
+    """Check the prefix, suffix and length of one masked ID and one masked code.
+
+    An 18-character ID number keeps 6 characters and gets 12 X characters.
+    An 18-character social credit code keeps 7 characters and gets 11.
+    """
+    processor = NerProcessor()
+
+    # Test ID number masking
+    id_entity = {'text': '310103198802080000', 'type': '身份证号'}
+    id_mapping = processor._generate_masked_mapping([id_entity], {'entity_groups': []})
+    masked_id = id_mapping.get('310103198802080000', '')
+
+    # Test social credit code masking
+    code_entity = {'text': '9133021276453538XT', 'type': '社会信用代码'}
+    code_mapping = processor._generate_masked_mapping([code_entity], {'entity_groups': []})
+    masked_code = code_mapping.get('9133021276453538XT', '')
+
+    # Verify the masking rules
+    assert masked_id.startswith('310103')  # First 6 characters preserved
+    assert masked_id.endswith('X' * 12)  # Remaining 12 characters masked with X
+    assert len(masked_id) == 18  # Total length preserved
+
+    assert masked_code.startswith('9133021')  # First 7 characters preserved
+    assert masked_code.endswith('X' * 11)  # Remaining 11 characters masked with X
+    assert len(masked_code) == 18  # Total length preserved
+
+    print(f"ID masking: 310103198802080000 -> {masked_id}")
+    print(f"Code masking: 9133021276453538XT -> {masked_code}")
+
+
+if __name__ == "__main__":
+    # Script-mode entry point: run every test in this file in order.
+    print("Testing ID and Social Credit Code Masking")
+    print("=" * 50)
+
+    test_id_number_masking()
+    print()
+    test_social_credit_code_masking()
+    print()
+    test_edge_cases()
+    print()
+    test_mixed_entities()
+    print()
+    test_id_masking()
				`@ -1 +0,0 @@`
				`关于张三天和北京易见天树有限公司的劳动纠纷`