dev #2
|
|
@ -278,6 +278,184 @@ class NerProcessor:
|
||||||
# 这里可以根据具体的公司名称模式进行更复杂的处理
|
# 这里可以根据具体的公司名称模式进行更复杂的处理
|
||||||
return company_name
|
return company_name
|
||||||
|
|
||||||
|
def _extract_address_components(self, address: str) -> Dict[str, str]:
    """
    Ask the LLM for the road name, house number, building name and
    community name contained in ``address``.

    The raw reply is parsed with ``LLMJsonExtractor.parse_raw_json_str`` and
    checked with ``LLMResponseValidator.validate_address_extraction``. If
    the reply is empty or invalid, or if anything raises along the way, the
    regex-based extractor is used instead.

    Args:
        address: The address text to analyse.

    Returns:
        The validated LLM payload, or the result of
        ``_extract_address_components_with_regex``.
    """
    prompt = f"""
    你是一个专业的地址分析助手。请从以下地址中提取需要脱敏的组件,并严格按照JSON格式返回结果。

    地址:{address}

    脱敏规则:
    1. 保留区级以上地址(省、市、区、县等)
    2. 路名(路名)需要脱敏:以大写首字母替代
    3. 门牌号(门牌数字)需要脱敏:以****代替
    4. 大厦名、小区名需要脱敏:以大写首字母替代

    示例:
    - 上海市静安区恒丰路66号白云大厦1607室
    - 路名:恒丰路
    - 门牌号:66
    - 大厦名:白云大厦
    - 小区名:(空)

    - 北京市朝阳区建国路88号SOHO现代城A座1001室
    - 路名:建国路
    - 门牌号:88
    - 大厦名:SOHO现代城
    - 小区名:(空)

    - 广州市天河区珠江新城花城大道123号富力中心B座2001室
    - 路名:花城大道
    - 门牌号:123
    - 大厦名:富力中心
    - 小区名:(空)

    请严格按照以下JSON格式输出,不要包含任何其他文字:

    {{
    "road_name": "提取的路名",
    "house_number": "提取的门牌号",
    "building_name": "提取的大厦名",
    "community_name": "提取的小区名(如果没有则为空字符串)",
    "confidence": 0.9
    }}

    注意:
    - road_name字段必须包含路名(如:恒丰路、建国路等)
    - house_number字段必须包含门牌号(如:66、88等)
    - building_name字段必须包含大厦名(如:白云大厦、SOHO现代城等)
    - community_name字段包含小区名,如果没有则为空字符串
    - confidence字段是0-1之间的数字,表示提取的置信度
    - 必须严格按照JSON格式,不要添加任何解释或额外文字
    """

    try:
        raw_reply = self.ollama_client.generate(prompt)
        logger.info(f"Raw LLM response for address extraction: {raw_reply}")

        # Parse the reply with the JSON extractor
        extracted = LLMJsonExtractor.parse_raw_json_str(raw_reply)

        # Guard clause: anything empty or schema-invalid goes to the regex path
        if not (extracted and LLMResponseValidator.validate_address_extraction(extracted)):
            logger.warning(f"Invalid JSON response for address extraction: {raw_reply}")
            return self._extract_address_components_with_regex(address)

        logger.info(f"Successfully extracted address components: {extracted}")
        return extracted
    except Exception as e:
        # Best effort: any failure of the LLM path degrades to regex extraction
        logger.error(f"LLM extraction failed: {e}")
        return self._extract_address_components_with_regex(address)
|
||||||
|
|
||||||
|
def _extract_address_components_with_regex(self, address: str) -> Dict[str, str]:
|
||||||
|
"""
|
||||||
|
使用正则表达式提取地址组件(回退方法)
|
||||||
|
"""
|
||||||
|
# 路名模式:通常以"路"、"街"、"大道"等结尾
|
||||||
|
road_pattern = r'([^省市区县]+[路街大道巷弄])'
|
||||||
|
|
||||||
|
# 门牌号模式:数字+号
|
||||||
|
house_number_pattern = r'(\d+)号'
|
||||||
|
|
||||||
|
# 大厦名模式:通常包含"大厦"、"中心"、"广场"等
|
||||||
|
building_pattern = r'([^号室]+(?:大厦|中心|广场|城|楼|座))'
|
||||||
|
|
||||||
|
# 小区名模式:通常包含"小区"、"花园"、"苑"等
|
||||||
|
community_pattern = r'([^号室]+(?:小区|花园|苑|园|庭))'
|
||||||
|
|
||||||
|
road_name = ""
|
||||||
|
house_number = ""
|
||||||
|
building_name = ""
|
||||||
|
community_name = ""
|
||||||
|
|
||||||
|
# 提取路名
|
||||||
|
road_match = re.search(road_pattern, address)
|
||||||
|
if road_match:
|
||||||
|
road_name = road_match.group(1).strip()
|
||||||
|
|
||||||
|
# 提取门牌号
|
||||||
|
house_match = re.search(house_number_pattern, address)
|
||||||
|
if house_match:
|
||||||
|
house_number = house_match.group(1)
|
||||||
|
|
||||||
|
# 提取大厦名
|
||||||
|
building_match = re.search(building_pattern, address)
|
||||||
|
if building_match:
|
||||||
|
building_name = building_match.group(1).strip()
|
||||||
|
|
||||||
|
# 提取小区名
|
||||||
|
community_match = re.search(community_pattern, address)
|
||||||
|
if community_match:
|
||||||
|
community_name = community_match.group(1).strip()
|
||||||
|
|
||||||
|
return {
|
||||||
|
"road_name": road_name,
|
||||||
|
"house_number": house_number,
|
||||||
|
"building_name": building_name,
|
||||||
|
"community_name": community_name,
|
||||||
|
"confidence": 0.5 # 较低置信度,因为是回退方法
|
||||||
|
}
|
||||||
|
|
||||||
|
def _mask_address(self, address: str) -> str:
    """
    Mask the sensitive parts of an address.

    Components are obtained from ``_extract_address_components`` and then
    replaced in the address text:

    * road name: pinyin initials of the stem, generic suffix kept
      (e.g. 恒丰路 -> HF路, 花城大道 -> HC大道);
    * house number: the digits before "号" become "**";
    * building / community name: pinyin initials of the stem, generic
      suffix kept (e.g. 白云大厦 -> BY大厦).

    Everything else, including the province/city/district prefix, is left
    untouched.

    Args:
        address: The address text to mask.

    Returns:
        The masked address; a falsy ``address`` is returned unchanged.
    """
    if not address:
        return address

    # Extract the address components
    components = self._extract_address_components(address)

    # Generic type words that stay readable after masking. Longer suffixes
    # come first so that e.g. "花园" is preferred over "园".
    road_suffixes = ("大道", "路", "街", "巷", "弄")
    building_suffixes = ("大厦", "中心", "广场", "城", "楼", "座")
    community_suffixes = ("小区", "花园", "苑", "园", "庭")

    def mask_name(name: str, suffixes: tuple, label: str) -> str:
        """Return uppercase pinyin initials of the name's stem plus its suffix."""
        # Only split off a suffix when something is left to mask in front of it
        suffix = next(
            (s for s in suffixes if name.endswith(s) and len(name) > len(s)),
            "",
        )
        stem = name[:len(name) - len(suffix)] if suffix else name
        try:
            pinyin_list = pinyin(stem, style=Style.NORMAL)
            initials = ''.join(p[0][0].upper() for p in pinyin_list if p and p[0])
        except Exception as e:
            logger.warning(f"Failed to get pinyin for {label} {name}: {e}")
            # If pinyin conversion fails, fall back to the first character
            initials = stem[0].upper()
        return initials + suffix

    masked_address = address

    # Replace the road name
    road_name = components.get("road_name")
    if road_name:
        masked_address = masked_address.replace(
            road_name, mask_name(road_name, road_suffixes, "road name")
        )

    # Replace the house number (digits only; the "号" marker is kept)
    house_number = components.get("house_number")
    if house_number:
        masked_address = masked_address.replace(house_number + "号", "**号")

    # Replace the building name
    building_name = components.get("building_name")
    if building_name:
        masked_address = masked_address.replace(
            building_name, mask_name(building_name, building_suffixes, "building name")
        )

    # Replace the community name
    community_name = components.get("community_name")
    if community_name:
        masked_address = masked_address.replace(
            community_name, mask_name(community_name, community_suffixes, "community name")
        )

    # NOTE(review): room/floor numbers (e.g. "1607室") are not masked here —
    # confirm whether they should become "****室" as well.
    return masked_address
|
||||||
|
|
||||||
def _process_entity_type(self, chunk: str, prompt_func, entity_type: str) -> Dict[str, str]:
|
def _process_entity_type(self, chunk: str, prompt_func, entity_type: str) -> Dict[str, str]:
|
||||||
for attempt in range(self.max_retries):
|
for attempt in range(self.max_retries):
|
||||||
try:
|
try:
|
||||||
|
|
@ -367,7 +545,7 @@ class NerProcessor:
|
||||||
7. 案号:只替换案号中的数字部分为***,保留前后结构和"号"字,支持中间有空格;
|
7. 案号:只替换案号中的数字部分为***,保留前后结构和"号"字,支持中间有空格;
|
||||||
8. 身份证号:6位X;
|
8. 身份证号:6位X;
|
||||||
9. 社会信用代码:8位X;
|
9. 社会信用代码:8位X;
|
||||||
10. 地址:保留区级及以上行政区划,去除详细位置;
|
10. 地址:保留区级以上地址,路名以大写首字母替代,门牌数字以****代替,大厦名、小区名以大写首字母替代(如:上海市静安区恒丰路66号白云大厦1607室→上海市静安区HF路**号BY大厦****室);
|
||||||
11. 其他类型按原有逻辑。
|
11. 其他类型按原有逻辑。
|
||||||
"""
|
"""
|
||||||
import re
|
import re
|
||||||
|
|
@ -435,12 +613,8 @@ class NerProcessor:
|
||||||
entity_mapping[text] = masked
|
entity_mapping[text] = masked
|
||||||
used_masked_names.add(masked)
|
used_masked_names.add(masked)
|
||||||
elif '地址' in entity_type:
|
elif '地址' in entity_type:
|
||||||
# 保留区级及以上行政区划,去除详细位置
|
# 使用新的地址脱敏方法
|
||||||
match = re.match(admin_pattern, text)
|
masked = self._mask_address(text)
|
||||||
if match:
|
|
||||||
masked = match.group(1)
|
|
||||||
else:
|
|
||||||
masked = text # fallback
|
|
||||||
entity_mapping[text] = masked
|
entity_mapping[text] = masked
|
||||||
used_masked_names.add(masked)
|
used_masked_names.add(masked)
|
||||||
elif '人名' in entity_type:
|
elif '人名' in entity_type:
|
||||||
|
|
|
||||||
|
|
@ -95,6 +95,36 @@ class LLMResponseValidator:
|
||||||
"required": ["business_name"]
|
"required": ["business_name"]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# JSON schema for address extraction responses: four required string
# components plus an optional confidence number between 0 and 1.
ADDRESS_EXTRACTION_SCHEMA = {
    "type": "object",
    "properties": {
        # The four string components share the same shape, so they are
        # generated from a (field, description) table.
        **{
            field: {"type": "string", "description": text}
            for field, text in (
                ("road_name", "The road name (路名) to be masked"),
                ("house_number", "The house number (门牌号) to be masked"),
                ("building_name", "The building name (大厦名) to be masked"),
                ("community_name", "The community name (小区名) to be masked"),
            )
        },
        "confidence": {
            "type": "number",
            "minimum": 0,
            "maximum": 1,
            "description": "Confidence level of the extraction (0-1)"
        }
    },
    "required": ["road_name", "house_number", "building_name", "community_name"]
}
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def validate_entity_extraction(cls, response: Dict[str, Any]) -> bool:
|
def validate_entity_extraction(cls, response: Dict[str, Any]) -> bool:
|
||||||
"""
|
"""
|
||||||
|
|
@ -180,6 +210,26 @@ class LLMResponseValidator:
|
||||||
logger.warning(f"Response that failed validation: {response}")
|
logger.warning(f"Response that failed validation: {response}")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
@classmethod
def validate_address_extraction(cls, response: Dict[str, Any]) -> bool:
    """
    Check an LLM address-extraction payload against ADDRESS_EXTRACTION_SCHEMA.

    Args:
        response: The parsed JSON response from the LLM.

    Returns:
        bool: True when the payload satisfies the schema; False when
        validation raises ValidationError, in which case the error and the
        payload are logged as warnings.
    """
    try:
        validate(instance=response, schema=cls.ADDRESS_EXTRACTION_SCHEMA)
    except ValidationError as exc:
        for message in (
            f"Address extraction validation failed: {exc}",
            f"Response that failed validation: {response}",
        ):
            logger.warning(message)
        return False

    logger.debug(f"Address extraction validation passed for response: {response}")
    return True
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _validate_linkage_content(cls, response: Dict[str, Any]) -> bool:
|
def _validate_linkage_content(cls, response: Dict[str, Any]) -> bool:
|
||||||
"""
|
"""
|
||||||
|
|
@ -240,7 +290,8 @@ class LLMResponseValidator:
|
||||||
'entity_extraction': cls.validate_entity_extraction,
|
'entity_extraction': cls.validate_entity_extraction,
|
||||||
'entity_linkage': cls.validate_entity_linkage,
|
'entity_linkage': cls.validate_entity_linkage,
|
||||||
'regex_entity': cls.validate_regex_entity,
|
'regex_entity': cls.validate_regex_entity,
|
||||||
'business_name_extraction': cls.validate_business_name_extraction
|
'business_name_extraction': cls.validate_business_name_extraction,
|
||||||
|
'address_extraction': cls.validate_address_extraction
|
||||||
}
|
}
|
||||||
|
|
||||||
validator = validators.get(response_type)
|
validator = validators.get(response_type)
|
||||||
|
|
@ -273,6 +324,8 @@ class LLMResponseValidator:
|
||||||
validate(instance=response, schema=cls.REGEX_ENTITY_SCHEMA)
|
validate(instance=response, schema=cls.REGEX_ENTITY_SCHEMA)
|
||||||
elif response_type == 'business_name_extraction':
|
elif response_type == 'business_name_extraction':
|
||||||
validate(instance=response, schema=cls.BUSINESS_NAME_EXTRACTION_SCHEMA)
|
validate(instance=response, schema=cls.BUSINESS_NAME_EXTRACTION_SCHEMA)
|
||||||
|
elif response_type == 'address_extraction':
|
||||||
|
validate(instance=response, schema=cls.ADDRESS_EXTRACTION_SCHEMA)
|
||||||
else:
|
else:
|
||||||
return f"Unknown response type: {response_type}"
|
return f"Unknown response type: {response_type}"
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,129 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Test file for address masking functionality
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
# Add the backend directory to the Python path for imports
|
||||||
|
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||||
|
|
||||||
|
from app.core.document_handlers.ner_processor import NerProcessor
|
||||||
|
|
||||||
|
|
||||||
|
def test_address_masking():
    """Run ``_mask_address`` over sample addresses and print the results.

    Nothing is asserted: the outcome depends on the LLM extraction, so the
    actual and expected values are only printed for manual comparison.
    """
    processor = NerProcessor()

    # (address, masked form called for by the requirements)
    samples = (
        ("上海市静安区恒丰路66号白云大厦1607室", "上海市静安区HF路**号BY大厦****室"),
        ("北京市朝阳区建国路88号SOHO现代城A座1001室", "北京市朝阳区JG路**号SOHO现代城A座****室"),
        ("广州市天河区珠江新城花城大道123号富力中心B座2001室", "广州市天河区珠江新城HC大道**号FL中心B座****室"),
        ("深圳市南山区科技园南区深南大道9988号腾讯大厦T1栋15楼", "深圳市南山区科技园南区SN大道**号TX大厦T1栋**楼"),
    )

    for source, wanted in samples:
        actual = processor._mask_address(source)
        report = (
            f"Original: {source}",
            f"Masked: {actual}",
            f"Expected: {wanted}",
            "-" * 50,
        )
        # Exact results may vary with the LLM, so this is print-only
        print("\n".join(report))
|
||||||
|
|
||||||
|
|
||||||
|
def test_address_component_extraction():
    """Run ``_extract_address_components`` on sample addresses and print the results.

    Nothing is asserted: the outcome depends on the LLM extraction, so the
    extracted and expected components are only printed for manual comparison.
    """
    processor = NerProcessor()

    # Address -> components the extraction is expected to produce
    samples = {
        "上海市静安区恒丰路66号白云大厦1607室": {
            "road_name": "恒丰路",
            "house_number": "66",
            "building_name": "白云大厦",
            "community_name": ""
        },
        "北京市朝阳区建国路88号SOHO现代城A座1001室": {
            "road_name": "建国路",
            "house_number": "88",
            "building_name": "SOHO现代城",
            "community_name": ""
        },
    }

    for address, expected_components in samples.items():
        extracted = processor._extract_address_components(address)
        report = (
            f"Address: {address}",
            f"Extracted components: {extracted}",
            f"Expected: {expected_components}",
            "-" * 50,
        )
        # Exact results may vary with the LLM, so this is print-only
        print("\n".join(report))
|
||||||
|
|
||||||
|
|
||||||
|
def test_regex_fallback():
    """Test the regex fallback for address extraction.

    Checks both the shape of the result and the values extracted from a
    sample address, so that a fallback returning the right keys with wrong
    content does not pass unnoticed.
    """
    processor = NerProcessor()

    # Test regex extraction (fallback method)
    test_address = "上海市静安区恒丰路66号白云大厦1607室"
    components = processor._extract_address_components_with_regex(test_address)

    print(f"Address: {test_address}")
    print(f"Regex extracted components: {components}")

    # Shape: every expected key is present
    for key in ("road_name", "house_number", "building_name", "community_name", "confidence"):
        assert key in components

    # Content: values for the sample address
    assert components["road_name"].startswith("恒丰路")
    assert components["house_number"] == "66"
    assert components["building_name"] == "白云大厦"
    assert components["community_name"] == ""
    assert components["confidence"] == 0.5
|
||||||
|
|
||||||
|
|
||||||
|
def test_json_validation_for_address():
    """Test JSON validation for address extraction responses.

    Covers one valid payload and two invalid ones derived from it: one with
    a required field removed and one with a field of the wrong type.
    """
    from app.core.utils.llm_validator import LLMResponseValidator

    # Valid response
    valid_response = {
        "road_name": "恒丰路",
        "house_number": "66",
        "building_name": "白云大厦",
        "community_name": "",
        "confidence": 0.9
    }
    assert LLMResponseValidator.validate_address_extraction(valid_response) is True

    # Invalid response: required field "community_name" is missing
    missing_field = {
        key: value for key, value in valid_response.items() if key != "community_name"
    }
    assert LLMResponseValidator.validate_address_extraction(missing_field) is False

    # Invalid response: "road_name" has the wrong type
    wrong_type = {**valid_response, "road_name": 123}
    assert LLMResponseValidator.validate_address_extraction(wrong_type) is False
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Manual runner: execute the tests in a fixed order, separated by blank lines
    print("Testing Address Masking Functionality")
    print("=" * 50)

    manual_suite = (
        test_regex_fallback,
        test_json_validation_for_address,
        test_address_component_extraction,
        test_address_masking,
    )
    for position, run_test in enumerate(manual_suite):
        if position:
            print()
        run_test()
|
||||||
Loading…
Reference in New Issue