feat: 地址脱敏隐去门牌、街道、小区等

This commit is contained in:
tigermren 2025-08-17 15:30:52 +08:00
parent 437e010aee
commit 2c985bc963
3 changed files with 364 additions and 8 deletions

View File

@ -278,6 +278,184 @@ class NerProcessor:
# 这里可以根据具体的公司名称模式进行更复杂的处理
return company_name
def _extract_address_components(self, address: str) -> Dict[str, str]:
    """
    Ask the LLM which parts of an address must be masked.

    The model is prompted to return a JSON object with the keys
    ``road_name``, ``house_number``, ``building_name``, ``community_name``
    and ``confidence``.

    Args:
        address: The raw address text.

    Returns:
        The parsed LLM response when it passes
        ``LLMResponseValidator.validate_address_extraction``; otherwise the
        result of ``_extract_address_components_with_regex`` (also used when
        any exception is raised while talking to the LLM or parsing).
    """
    llm_prompt = f"""
你是一个专业的地址分析助手请从以下地址中提取需要脱敏的组件并严格按照JSON格式返回结果
地址{address}
脱敏规则
1. 保留区级以上地址县等
2. 路名路名需要脱敏以大写首字母替代
3. 门牌号门牌数字需要脱敏****代替
4. 大厦名小区名需要脱敏以大写首字母替代
示例
- 上海市静安区恒丰路66号白云大厦1607室
- 路名恒丰路
- 门牌号66
- 大厦名白云大厦
- 小区名
- 北京市朝阳区建国路88号SOHO现代城A座1001室
- 路名建国路
- 门牌号88
- 大厦名SOHO现代城
- 小区名
- 广州市天河区珠江新城花城大道123号富力中心B座2001室
- 路名花城大道
- 门牌号123
- 大厦名富力中心
- 小区名
请严格按照以下JSON格式输出不要包含任何其他文字
{{
"road_name": "提取的路名",
"house_number": "提取的门牌号",
"building_name": "提取的大厦名",
"community_name": "提取的小区名(如果没有则为空字符串)",
"confidence": 0.9
}}
注意
- road_name字段必须包含路名恒丰路建国路等
- house_number字段必须包含门牌号6688
- building_name字段必须包含大厦名白云大厦SOHO现代城等
- community_name字段包含小区名如果没有则为空字符串
- confidence字段是0-1之间的数字表示提取的置信度
- 必须严格按照JSON格式不要添加任何解释或额外文字
"""
    try:
        raw_reply = self.ollama_client.generate(llm_prompt)
        logger.info(f"Raw LLM response for address extraction: {raw_reply}")

        # Parse the reply with the shared JSON extractor, then schema-check it.
        extracted = LLMJsonExtractor.parse_raw_json_str(raw_reply)
        usable = extracted and LLMResponseValidator.validate_address_extraction(extracted)
        if not usable:
            # Unparseable or schema-invalid reply: fall back to the regex heuristics.
            logger.warning(f"Invalid JSON response for address extraction: {raw_reply}")
            return self._extract_address_components_with_regex(address)

        logger.info(f"Successfully extracted address components: {extracted}")
        return extracted
    except Exception as e:
        # Any failure on the LLM path degrades to the regex heuristics.
        logger.error(f"LLM extraction failed: {e}")
        return self._extract_address_components_with_regex(address)
def _extract_address_components_with_regex(self, address: str) -> Dict[str, str]:
    """
    Extract address components with regular expressions (fallback method).

    Heuristics used:
      * road name: the shortest run of characters that contains none of
        省/市/区/县 and ends in 大道, 路, 街, 巷 or 弄;
      * house number: the digits immediately before the first "号";
      * building / community name: searched only in the text that follows
        the house number (or in the whole address when there is no house
        number), so that the administrative region and the road name are
        never captured as part of the name.

    Args:
        address: The raw address text. Empty or ``None`` yields empty
            components.

    Returns:
        A dict with the keys ``road_name``, ``house_number``,
        ``building_name``, ``community_name`` (empty string when not found)
        and ``confidence`` (always 0.5 for this heuristic path).
    """
    components = {
        "road_name": "",
        "house_number": "",
        "building_name": "",
        "community_name": "",
        "confidence": 0.5,  # lower confidence because this is the fallback path
    }
    if not address:
        return components

    # Road suffixes are an alternation so that the two-character "大道" is
    # matched as a unit (a character class would accept a lone "大"), and the
    # prefix is lazy so the match stops at the FIRST suffix instead of
    # swallowing the house number and building name.
    road_pattern = r'([^省市区县]+?(?:大道|路|街|巷|弄))'
    # House number: digits followed by "号".
    house_number_pattern = r'(\d+)号'
    # Building / community names: lazy prefix that may not cross region or
    # road markers, followed by a typical name suffix.
    building_pattern = r'([^省市区县路街道巷弄号室]+?(?:大厦|中心|广场|城|楼|座))'
    community_pattern = r'([^省市区县路街道巷弄号室]+?(?:小区|花园|苑|园|庭))'

    road_match = re.search(road_pattern, address)
    if road_match:
        components["road_name"] = road_match.group(1).strip()

    house_match = re.search(house_number_pattern, address)
    if house_match:
        components["house_number"] = house_match.group(1)
        # Names of buildings/communities come after "<number>号".
        detail = address[house_match.end():]
    else:
        detail = address

    building_match = re.search(building_pattern, detail)
    if building_match:
        components["building_name"] = building_match.group(1).strip()

    community_match = re.search(community_pattern, detail)
    if community_match:
        components["community_name"] = community_match.group(1).strip()

    return components
def _mask_address(self, address: str) -> str:
    """
    Mask the sensitive parts of an address.

    The components reported by ``_extract_address_components`` are replaced
    in this order: road name, house number, building name, community name.
    Names are replaced by the uppercase pinyin initials of their characters;
    the house number is replaced by ``**号``.

    Args:
        address: The raw address text.

    Returns:
        The masked address. A falsy ``address`` is returned unchanged.
    """
    import re  # local import keeps this block self-contained

    if not address:
        return address

    components = self._extract_address_components(address)
    masked_address = address

    # NOTE(review): the documented example output keeps the type suffix
    # (e.g. "HF路", "BY大厦"), while this abbreviates the whole name
    # (e.g. "HFL") - confirm which form is intended.
    road_name = components.get("road_name")
    if road_name:
        masked_address = masked_address.replace(
            road_name, self._address_part_initials(road_name, "road name")
        )

    house_number = components.get("house_number")
    if house_number:
        # Tolerate an extractor that already appended "号" to the number.
        number = house_number.rstrip("号")
        if number:
            # Replace the number only when it stands alone (not inside a
            # longer digit run such as a room number) and consume a trailing
            # "号" so the output never ends up as "**号号".
            masked_address = re.sub(
                r'(?<!\d)' + re.escape(number) + r'(?!\d)号?',
                "**号",
                masked_address,
            )

    for key, label in (
        ("building_name", "building name"),
        ("community_name", "community name"),
    ):
        name = components.get(key)
        if name:
            masked_address = masked_address.replace(
                name, self._address_part_initials(name, label)
            )

    return masked_address

def _address_part_initials(self, name: str, label: str) -> str:
    """Return the uppercase pinyin initials of *name*; *label* is used only in the log message."""
    try:
        syllables = pinyin(name, style=Style.NORMAL)
        return ''.join(s[0][0].upper() for s in syllables if s and s[0])
    except Exception as e:
        logger.warning(f"Failed to get pinyin for {label} {name}: {e}")
        # Pinyin conversion failed: fall back to the first character of the name.
        return name[0].upper()
def _process_entity_type(self, chunk: str, prompt_func, entity_type: str) -> Dict[str, str]:
for attempt in range(self.max_retries):
try:
@ -367,7 +545,7 @@ class NerProcessor:
7. 案号只替换案号中的数字部分为***保留前后结构和""支持中间有空格
8. 身份证号6位X
9. 社会信用代码8位X
10. 地址保留区级及以上行政区划去除详细位置
10. 地址保留区级以上地址路名以大写首字母替代门牌数字以****代替大厦名小区名以大写首字母替代上海市静安区恒丰路66号白云大厦1607室上海市静安区HF路**号BY大厦****
11. 其他类型按原有逻辑
"""
import re
@ -435,12 +613,8 @@ class NerProcessor:
entity_mapping[text] = masked
used_masked_names.add(masked)
elif '地址' in entity_type:
# 保留区级及以上行政区划,去除详细位置
match = re.match(admin_pattern, text)
if match:
masked = match.group(1)
else:
masked = text # fallback
# 使用新的地址脱敏方法
masked = self._mask_address(text)
entity_mapping[text] = masked
used_masked_names.add(masked)
elif '人名' in entity_type:

View File

@ -95,6 +95,36 @@ class LLMResponseValidator:
"required": ["business_name"]
}
# JSON schema for the LLM's address-extraction reply: four required string
# fields naming the parts to mask, plus an optional confidence in [0, 1].
ADDRESS_EXTRACTION_SCHEMA = {
    "type": "object",
    "properties": {
        "road_name": {"type": "string", "description": "The road name (路名) to be masked"},
        "house_number": {"type": "string", "description": "The house number (门牌号) to be masked"},
        "building_name": {"type": "string", "description": "The building name (大厦名) to be masked"},
        "community_name": {"type": "string", "description": "The community name (小区名) to be masked"},
        "confidence": {
            "type": "number",
            "minimum": 0,
            "maximum": 1,
            "description": "Confidence level of the extraction (0-1)",
        },
    },
    "required": ["road_name", "house_number", "building_name", "community_name"],
}
@classmethod
def validate_entity_extraction(cls, response: Dict[str, Any]) -> bool:
"""
@ -180,6 +210,26 @@ class LLMResponseValidator:
logger.warning(f"Response that failed validation: {response}")
return False
@classmethod
def validate_address_extraction(cls, response: Dict[str, Any]) -> bool:
    """
    Check an address-extraction reply against ``ADDRESS_EXTRACTION_SCHEMA``.

    Args:
        response: The parsed JSON response from the LLM.

    Returns:
        bool: True when the response conforms to the schema; False (after
        logging the validation error and the offending response) otherwise.
    """
    try:
        validate(instance=response, schema=cls.ADDRESS_EXTRACTION_SCHEMA)
    except ValidationError as err:
        logger.warning(f"Address extraction validation failed: {err}")
        logger.warning(f"Response that failed validation: {response}")
        return False

    logger.debug(f"Address extraction validation passed for response: {response}")
    return True
@classmethod
def _validate_linkage_content(cls, response: Dict[str, Any]) -> bool:
"""
@ -240,7 +290,8 @@ class LLMResponseValidator:
'entity_extraction': cls.validate_entity_extraction,
'entity_linkage': cls.validate_entity_linkage,
'regex_entity': cls.validate_regex_entity,
'business_name_extraction': cls.validate_business_name_extraction
'business_name_extraction': cls.validate_business_name_extraction,
'address_extraction': cls.validate_address_extraction
}
validator = validators.get(response_type)
@ -273,6 +324,8 @@ class LLMResponseValidator:
validate(instance=response, schema=cls.REGEX_ENTITY_SCHEMA)
elif response_type == 'business_name_extraction':
validate(instance=response, schema=cls.BUSINESS_NAME_EXTRACTION_SCHEMA)
elif response_type == 'address_extraction':
validate(instance=response, schema=cls.ADDRESS_EXTRACTION_SCHEMA)
else:
return f"Unknown response type: {response_type}"

View File

@ -0,0 +1,129 @@
#!/usr/bin/env python3
"""
Test file for address masking functionality
"""
import pytest
import sys
import os
# Add the backend directory to the Python path for imports
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from app.core.document_handlers.ner_processor import NerProcessor
def test_address_masking():
    """Print the masked form of sample addresses next to the expected form."""
    processor = NerProcessor()

    # (original address, expected masked address) pairs from the requirements.
    samples = [
        ("上海市静安区恒丰路66号白云大厦1607室", "上海市静安区HF路**号BY大厦****室"),
        ("北京市朝阳区建国路88号SOHO现代城A座1001室", "北京市朝阳区JG路**号SOHO现代城A座****室"),
        ("广州市天河区珠江新城花城大道123号富力中心B座2001室", "广州市天河区珠江新城HC大道**号FL中心B座****室"),
        ("深圳市南山区科技园南区深南大道9988号腾讯大厦T1栋15楼", "深圳市南山区科技园南区SN大道**号TX大厦T1栋**楼"),
    ]
    divider = "-" * 50

    # No assertions: LLM extraction may vary, so the output is only printed
    # for manual verification.
    for source_text, wanted in samples:
        actual = processor._mask_address(source_text)
        report = [
            f"Original: {source_text}",
            f"Masked: {actual}",
            f"Expected: {wanted}",
            divider,
        ]
        print("\n".join(report))
def test_address_component_extraction():
    """Print the extracted components of sample addresses next to the expected ones."""
    processor = NerProcessor()

    # Address -> components the extraction is expected to report.
    expectations = {
        "上海市静安区恒丰路66号白云大厦1607室": {
            "road_name": "恒丰路",
            "house_number": "66",
            "building_name": "白云大厦",
            "community_name": "",
        },
        "北京市朝阳区建国路88号SOHO现代城A座1001室": {
            "road_name": "建国路",
            "house_number": "88",
            "building_name": "SOHO现代城",
            "community_name": "",
        },
    }
    divider = "-" * 50

    # No assertions: LLM extraction may vary, so the output is only printed
    # for manual verification.
    for address, expected_components in expectations.items():
        components = processor._extract_address_components(address)
        report = [
            f"Address: {address}",
            f"Extracted components: {components}",
            f"Expected: {expected_components}",
            divider,
        ]
        print("\n".join(report))
def test_regex_fallback():
    """Check that the regex fallback returns every expected component key."""
    processor = NerProcessor()

    # Exercise the fallback extractor directly, bypassing the LLM path.
    test_address = "上海市静安区恒丰路66号白云大厦1607室"
    components = processor._extract_address_components_with_regex(test_address)
    print(f"Address: {test_address}")
    print(f"Regex extracted components: {components}")

    # Basic validation: only the presence of each key is checked.
    expected_keys = (
        "road_name",
        "house_number",
        "building_name",
        "community_name",
        "confidence",
    )
    for key in expected_keys:
        assert key in components
def test_json_validation_for_address():
    """Check schema validation of address-extraction responses.

    Covers one valid response, one with a missing required field and one
    with a wrongly typed field.
    """
    from app.core.utils.llm_validator import LLMResponseValidator

    # Valid response: all required fields present with the right types.
    valid_response = {
        "road_name": "恒丰路",
        "house_number": "66",
        "building_name": "白云大厦",
        "community_name": "",
        "confidence": 0.9,
    }
    assert LLMResponseValidator.validate_address_extraction(valid_response)

    # Invalid response: the required "community_name" field is missing.
    invalid_response = {
        "road_name": "恒丰路",
        "house_number": "66",
        "building_name": "白云大厦",
        "confidence": 0.9,
    }
    assert not LLMResponseValidator.validate_address_extraction(invalid_response)

    # Invalid response: "road_name" is a number instead of a string.
    invalid_response2 = {
        "road_name": 123,
        "house_number": "66",
        "building_name": "白云大厦",
        "community_name": "",
        "confidence": 0.9,
    }
    assert not LLMResponseValidator.validate_address_extraction(invalid_response2)
if __name__ == "__main__":
    # Manual runner: executes every check in order, separated by blank lines.
    print("Testing Address Masking Functionality")
    print("=" * 50)
    checks = (
        test_regex_fallback,
        test_json_validation_for_address,
        test_address_component_extraction,
        test_address_masking,
    )
    for position, check in enumerate(checks):
        if position:
            print()
        check()