# legal-doc-masker/backend/tests/test_address_masking.py
#
# 130 lines
# 4.4 KiB
# Python

#!/usr/bin/env python3
"""
Test file for address masking functionality
"""
import pytest
import sys
import os
# Add the backend directory to the Python path for imports
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from app.core.document_handlers.ner_processor import NerProcessor
def test_address_masking():
    """Print masked addresses next to their expected values.

    Each sample address is passed to ``NerProcessor._mask_address`` and the
    input, the actual result and the expected result are printed for manual
    comparison. Nothing is asserted here: the masking result may vary from
    run to run because of LLM extraction.
    """
    masker = NerProcessor()
    divider = "-" * 50

    # (input address, masked form expected by the requirements)
    samples = (
        ("上海市静安区恒丰路66号白云大厦1607室", "上海市静安区HF路**号BY大厦****室"),
        ("北京市朝阳区建国路88号SOHO现代城A座1001室", "北京市朝阳区JG路**号SOHO现代城A座****室"),
        ("广州市天河区珠江新城花城大道123号富力中心B座2001室", "广州市天河区珠江新城HC大道**号FL中心B座****室"),
        ("深圳市南山区科技园南区深南大道9988号腾讯大厦T1栋15楼", "深圳市南山区科技园南区SN大道**号TX大厦T1栋**楼"),
    )

    for source_text, wanted in samples:
        actual = masker._mask_address(source_text)
        # One report per sample: input, actual, expected, then a divider.
        print(
            f"Original: {source_text}",
            f"Masked: {actual}",
            f"Expected: {wanted}",
            divider,
            sep="\n",
        )
def test_address_component_extraction():
    """Print extracted address components next to the expected ones.

    Each sample address is passed to
    ``NerProcessor._extract_address_components`` and the address, the
    returned components and the expected components are printed for manual
    comparison. Nothing is asserted here: the extraction result may vary
    because of LLM extraction.
    """
    extractor = NerProcessor()
    divider = "-" * 50

    # (input address, components the extraction is expected to produce)
    samples = (
        (
            "上海市静安区恒丰路66号白云大厦1607室",
            {
                "road_name": "恒丰路",
                "house_number": "66",
                "building_name": "白云大厦",
                "community_name": "",
            },
        ),
        (
            "北京市朝阳区建国路88号SOHO现代城A座1001室",
            {
                "road_name": "建国路",
                "house_number": "88",
                "building_name": "SOHO现代城",
                "community_name": "",
            },
        ),
    )

    for raw_address, wanted in samples:
        found = extractor._extract_address_components(raw_address)
        # One report per sample: input, actual, expected, then a divider.
        print(
            f"Address: {raw_address}",
            f"Extracted components: {found}",
            f"Expected: {wanted}",
            divider,
            sep="\n",
        )
def test_regex_fallback():
    """Check that the regex fallback extractor returns every expected key.

    Only the presence of the keys is validated; the extracted values are
    printed but not checked.
    """
    extractor = NerProcessor()
    sample = "上海市静安区恒丰路66号白云大厦1607室"

    extracted = extractor._extract_address_components_with_regex(sample)
    print(f"Address: {sample}")
    print(f"Regex extracted components: {extracted}")

    # Basic validation: every key must be present in the result,
    # checked in the same order as before.
    required_keys = (
        "road_name",
        "house_number",
        "building_name",
        "community_name",
        "confidence",
    )
    for key in required_keys:
        assert key in extracted
def test_json_validation_for_address():
    """Check that address-extraction responses are validated as expected.

    Three payloads are passed to
    ``LLMResponseValidator.validate_address_extraction``:

    * a payload with all five fields, which must be accepted;
    * a payload without ``community_name``, which must be rejected;
    * a payload whose ``road_name`` is an int, which must be rejected.

    The result is checked by truthiness rather than compared to
    ``True``/``False`` with ``==`` (PEP 8 programming recommendations).
    """
    # Imported inside the test, as before, so this module can be imported
    # without pulling in the validator.
    from app.core.utils.llm_validator import LLMResponseValidator

    validate = LLMResponseValidator.validate_address_extraction

    # Valid response: every field present.
    complete_payload = {
        "road_name": "恒丰路",
        "house_number": "66",
        "building_name": "白云大厦",
        "community_name": "",
        "confidence": 0.9,
    }
    assert validate(complete_payload)

    # Invalid response: the "community_name" field is missing.
    missing_field_payload = {
        "road_name": "恒丰路",
        "house_number": "66",
        "building_name": "白云大厦",
        "confidence": 0.9,
    }
    assert not validate(missing_field_payload)

    # Invalid response: "road_name" is an int instead of a string.
    wrong_type_payload = {
        "road_name": 123,
        "house_number": "66",
        "building_name": "白云大厦",
        "community_name": "",
        "confidence": 0.9,
    }
    assert not validate(wrong_type_payload)
if __name__ == "__main__":
    # Manual runner: print a banner, then run each test function in a fixed
    # order with a blank line between consecutive runs.
    print("Testing Address Masking Functionality")
    print("=" * 50)

    checks = (
        test_regex_fallback,
        test_json_validation_for_address,
        test_address_component_extraction,
        test_address_masking,
    )
    for position, check in enumerate(checks):
        if position:
            print()  # separator only between runs, none before the first
        check()