dev #2
|
|
@ -7,7 +7,7 @@ from ...core.config import settings
|
||||||
from ..utils.json_extractor import LLMJsonExtractor
|
from ..utils.json_extractor import LLMJsonExtractor
|
||||||
from ..utils.llm_validator import LLMResponseValidator
|
from ..utils.llm_validator import LLMResponseValidator
|
||||||
import re
|
import re
|
||||||
from .regs.entity_regex import extract_id_number_entities, extract_social_credit_code_entities
|
from .regs.entity_regex import extract_id_number_entities, extract_social_credit_code_entities, extract_case_number_entities
|
||||||
from .extractors.ner_extractor import NERExtractor
|
from .extractors.ner_extractor import NERExtractor
|
||||||
from pypinyin import pinyin, Style
|
from pypinyin import pinyin, Style
|
||||||
|
|
||||||
|
|
@ -722,7 +722,8 @@ class NerProcessor:
|
||||||
|
|
||||||
regex_entity_extractors = [
|
regex_entity_extractors = [
|
||||||
extract_id_number_entities,
|
extract_id_number_entities,
|
||||||
extract_social_credit_code_entities
|
extract_social_credit_code_entities,
|
||||||
|
extract_case_number_entities
|
||||||
]
|
]
|
||||||
for extractor in regex_entity_extractors:
|
for extractor in regex_entity_extractors:
|
||||||
mapping = extractor(chunk)
|
mapping = extractor(chunk)
|
||||||
|
|
@ -733,6 +734,38 @@ class NerProcessor:
|
||||||
|
|
||||||
return mapping_pipeline
|
return mapping_pipeline
|
||||||
|
|
||||||
|
def build_mapping_regex_only(self, chunk: str) -> list[Dict[str, str]]:
    """
    Collect entity mappings produced by the regex extractors alone (no NER, no LLM).

    Args:
        chunk: Text chunk to scan.

    Returns:
        The mappings that passed LLMResponseValidator.validate_regex_entity,
        in the order the extractors were run.
    """
    collected = []

    # ID numbers, social credit codes, then case numbers; result keeps this order.
    for regex_extractor in (
        extract_id_number_entities,
        extract_social_credit_code_entities,
        extract_case_number_entities,
    ):
        found = regex_extractor(chunk)

        # Nothing extracted: note it at debug level and move on.
        if not found:
            logger.debug(f"No entities found by {regex_extractor.__name__}")
            continue

        # Something was extracted but it does not have the expected shape.
        if not LLMResponseValidator.validate_regex_entity(found):
            logger.warning(f"Invalid regex entity mapping format: {found}")
            continue

        collected.append(found)
        logger.info(f"Regex extraction: Added mapping from {regex_extractor.__name__}")

    logger.info(f"Regex-only extraction: Found {len(collected)} mappings")
    return collected
|
||||||
|
|
||||||
def build_mapping_llm_only(self, chunk: str) -> list[Dict[str, str]]:
|
def build_mapping_llm_only(self, chunk: str) -> list[Dict[str, str]]:
|
||||||
"""
|
"""
|
||||||
Build mapping using only LLM (no NER)
|
Build mapping using only LLM (no NER)
|
||||||
|
|
@ -758,10 +791,11 @@ class NerProcessor:
|
||||||
if mapping:
|
if mapping:
|
||||||
mapping_pipeline.append(mapping)
|
mapping_pipeline.append(mapping)
|
||||||
|
|
||||||
# Include regex-based extraction for IDs and codes
|
# Include regex-based extraction for IDs, codes, and case numbers
|
||||||
regex_entity_extractors = [
|
regex_entity_extractors = [
|
||||||
extract_id_number_entities,
|
extract_id_number_entities,
|
||||||
extract_social_credit_code_entities
|
extract_social_credit_code_entities,
|
||||||
|
extract_case_number_entities
|
||||||
]
|
]
|
||||||
for extractor in regex_entity_extractors:
|
for extractor in regex_entity_extractors:
|
||||||
mapping = extractor(chunk)
|
mapping = extractor(chunk)
|
||||||
|
|
@ -792,10 +826,11 @@ class NerProcessor:
|
||||||
mapping_pipeline.append(ner_mapping)
|
mapping_pipeline.append(ner_mapping)
|
||||||
logger.info(f"NER-only extraction: Added {len(ner_entities)} entities")
|
logger.info(f"NER-only extraction: Added {len(ner_entities)} entities")
|
||||||
|
|
||||||
# Still include regex-based extraction for IDs and codes
|
# Still include regex-based extraction for IDs, codes, and case numbers
|
||||||
regex_entity_extractors = [
|
regex_entity_extractors = [
|
||||||
extract_id_number_entities,
|
extract_id_number_entities,
|
||||||
extract_social_credit_code_entities
|
extract_social_credit_code_entities,
|
||||||
|
extract_case_number_entities
|
||||||
]
|
]
|
||||||
for extractor in regex_entity_extractors:
|
for extractor in regex_entity_extractors:
|
||||||
mapping = extractor(chunk)
|
mapping = extractor(chunk)
|
||||||
|
|
@ -1179,7 +1214,12 @@ class NerProcessor:
|
||||||
chunk_mappings.append(ner_mapping)
|
chunk_mappings.append(ner_mapping)
|
||||||
logger.info(f"Added {len(ner_entities)} NER entities to mappings")
|
logger.info(f"Added {len(ner_entities)} NER entities to mappings")
|
||||||
|
|
||||||
logger.info(f"Final chunk mappings: {chunk_mappings}")
|
logger.info(f"NER-only mappings: {chunk_mappings}")
|
||||||
|
|
||||||
|
regex_mapping = self.build_mapping_regex_only(merged_text)
|
||||||
|
logger.info(f"Regex mapping: {regex_mapping}")
|
||||||
|
chunk_mappings.extend(regex_mapping)
|
||||||
|
|
||||||
|
|
||||||
unique_entities = self._merge_entity_mappings(chunk_mappings)
|
unique_entities = self._merge_entity_mappings(chunk_mappings)
|
||||||
logger.info(f"Unique entities: {unique_entities}")
|
logger.info(f"Unique entities: {unique_entities}")
|
||||||
|
|
|
||||||
|
|
@ -16,3 +16,12 @@ def extract_social_credit_code_entities(chunk: str) -> dict:
|
||||||
for match in re.findall(credit_pattern, chunk):
|
for match in re.findall(credit_pattern, chunk):
|
||||||
entities.append({"text": match, "type": "统一社会信用代码"})
|
entities.append({"text": match, "type": "统一社会信用代码"})
|
||||||
return {"entities": entities} if entities else {}
|
return {"entities": entities} if entities else {}
|
||||||
|
|
||||||
|
def extract_case_number_entities(chunk: str) -> dict:
    """Extract court case numbers and return them in entity mapping format.

    Recognised shape: a four-digit year in parentheses (ASCII or full-width),
    then non-digit text, digits, optional non-digit text, digits, optional
    whitespace and a closing "号" -- e.g. "(2022)京 03 民终 3852 号" or
    "(2020)京0105 民初69754 号".

    Args:
        chunk: Text to scan.

    Returns:
        {"entities": [{"text": <case number>, "type": "案号"}, ...]} with one
        entry per match in order of appearance, or {} when nothing matches.
    """
    # Only whitespace is allowed between the final digits and "号". A greedy
    # non-digit run in that position would stretch the match to the last "号"
    # before the next digit and swallow unrelated text after the case number.
    # \uff08 / \uff09 are the full-width parentheses; escapes keep the pattern
    # unambiguous regardless of how the file is encoded or displayed.
    case_pattern = r'[(\uff08]\d{4}[)\uff09][^\d]*\d+[^\d]*\d+\s*号'
    entities = [
        {"text": match, "type": "案号"}
        for match in re.findall(case_pattern, chunk)
    ]
    return {"entities": entities} if entities else {}
|
||||||
Loading…
Reference in New Issue