2025-08-20 02:20:42 +00:00
2 changed files with 56 additions and 7 deletions
--- a/backend/app/core/document_handlers/ner_processor.py
+++ b/backend/app/core/document_handlers/ner_processor.py
@ -7,7 +7,7 @@ from ...core.config import settings
 from ..utils.json_extractor import LLMJsonExtractor
 from ..utils.llm_validator import LLMResponseValidator
 import re
-from .regs.entity_regex import extract_id_number_entities, extract_social_credit_code_entities
+from .regs.entity_regex import extract_id_number_entities, extract_social_credit_code_entities, extract_case_number_entities
 from .extractors.ner_extractor import NERExtractor
 from pypinyin import pinyin, Style
@ -722,7 +722,8 @@ class NerProcessor:
        regex_entity_extractors = [
            extract_id_number_entities,
-            extract_social_credit_code_entities
+            extract_social_credit_code_entities,
            extract_case_number_entities
        ]
        for extractor in regex_entity_extractors:
            mapping = extractor(chunk)
@ -733,6 +734,38 @@ class NerProcessor:
        return mapping_pipeline
    def build_mapping_regex_only(self, chunk: str) -> list[Dict[str, str]]:
        """
        Collect entity mappings from the regex extractors alone (no NER, no LLM).

        Every extractor is run over ``chunk``. A non-empty result is kept only
        when ``LLMResponseValidator.validate_regex_entity`` accepts it; a
        rejected result is logged as a warning and dropped.

        Args:
            chunk: Text chunk to process

        Returns:
            List of accepted entity mappings, in extractor order
        """
        collected = []

        # ID numbers, social credit codes and case numbers, in that order.
        for regex_extractor in (
            extract_id_number_entities,
            extract_social_credit_code_entities,
            extract_case_number_entities,
        ):
            found = regex_extractor(chunk)

            # Nothing extracted: only worth a debug line.
            if not found:
                logger.debug(f"No entities found by {regex_extractor.__name__}")
                continue

            # Something extracted, but in a shape the validator rejects.
            if not LLMResponseValidator.validate_regex_entity(found):
                logger.warning(f"Invalid regex entity mapping format: {found}")
                continue

            collected.append(found)
            logger.info(f"Regex extraction: Added mapping from {regex_extractor.__name__}")

        logger.info(f"Regex-only extraction: Found {len(collected)} mappings")
        return collected
    def build_mapping_llm_only(self, chunk: str) -> list[Dict[str, str]]:
        """
        Build mapping using only LLM (no NER)
@ -758,10 +791,11 @@ class NerProcessor:
            if mapping:
                mapping_pipeline.append(mapping)
-        # Include regex-based extraction for IDs and codes
+        # Include regex-based extraction for IDs, codes, and case numbers
        regex_entity_extractors = [
            extract_id_number_entities,
-            extract_social_credit_code_entities
+            extract_social_credit_code_entities,
            extract_case_number_entities
        ]
        for extractor in regex_entity_extractors:
            mapping = extractor(chunk)
@ -792,10 +826,11 @@ class NerProcessor:
            mapping_pipeline.append(ner_mapping)
            logger.info(f"NER-only extraction: Added {len(ner_entities)} entities")
-        # Still include regex-based extraction for IDs and codes
+        # Still include regex-based extraction for IDs, codes, and case numbers
        regex_entity_extractors = [
            extract_id_number_entities,
-            extract_social_credit_code_entities
+            extract_social_credit_code_entities,
            extract_case_number_entities
        ]
        for extractor in regex_entity_extractors:
            mapping = extractor(chunk)
@ -1179,7 +1214,12 @@ class NerProcessor:
            chunk_mappings.append(ner_mapping)
            logger.info(f"Added {len(ner_entities)} NER entities to mappings")
-        logger.info(f"Final chunk mappings: {chunk_mappings}")
+        logger.info(f"NER-only mappings: {chunk_mappings}")
        regex_mapping = self.build_mapping_regex_only(merged_text)
        logger.info(f"Regex mapping: {regex_mapping}")
        chunk_mappings.extend(regex_mapping)
        unique_entities = self._merge_entity_mappings(chunk_mappings)
        logger.info(f"Unique entities: {unique_entities}")
--- a/backend/app/core/document_handlers/regs/entity_regex.py
+++ b/backend/app/core/document_handlers/regs/entity_regex.py
@ -16,3 +16,12 @@ def extract_social_credit_code_entities(chunk: str) -> dict:
    for match in re.findall(credit_pattern, chunk):
        entities.append({"text": match, "type": "统一社会信用代码"})
    return {"entities": entities} if entities else {}
 def extract_case_number_entities(chunk: str) -> dict:
    """Extract Chinese court case numbers and return them in entity mapping format.

    A case number is recognised as: a four-digit year in ASCII or full-width
    parentheses, a short court label, a numeric court code, a short case-type
    label, a serial number, and the closing character "号".  Whitespace may
    appear between the parts, e.g. "(2022)京 03 民终 3852 号" or
    "（2020）京0105 民初69754 号".

    Args:
        chunk: Text to scan.

    Returns:
        ``{"entities": [{"text": <case number>, "type": "案号"}, ...]}`` with
        one entry per occurrence in order of appearance, or ``{}`` when the
        chunk contains no case number.
    """
    # The labels between the numeric parts are limited to a few whitespace/CJK
    # characters so a match cannot run across punctuation or whole sentences.
    # Only whitespace is allowed between the serial number and "号"; a greedy
    # "anything but digits" run there would extend the match to the last "号"
    # that follows (e.g. "...3852号民事判决书，案号").
    case_pattern = (
        r'[（(]\d{4}[）)]'
        r'[\s\u4e00-\u9fff]{0,10}\d+'
        r'[\s\u4e00-\u9fff]{0,10}\d+'
        r'\s*号'
    )
    entities = [
        {"text": match, "type": "案号"}
        for match in re.findall(case_pattern, chunk)
    ]
    return {"entities": entities} if entities else {}