feat: 过滤掉置信度低的entity

This commit is contained in:
tigerenwork 2025-08-19 17:26:30 +08:00
parent 24f452818a
commit ffa31d33de
2 changed files with 24 additions and 13 deletions

View File

@ -19,6 +19,7 @@ class NERExtractor(BaseExtractor):
self.model = None
self.ner_pipeline = None
self._model_initialized = False
self.confidence_threshold = 0.95
# Map CLUENER model labels to our desired categories
self.label_map = {
@ -103,7 +104,9 @@ class NERExtractor(BaseExtractor):
"""
try:
# Run the NER pipeline - it handles truncation automatically
logger.info(f"Running NER pipeline with text: {text}")
results = self.ner_pipeline(text)
logger.info(f"NER results: {results}")
# Filter and process entities
filtered_entities = []
@ -119,14 +122,21 @@ class NERExtractor(BaseExtractor):
# Clean up the tokenized text (remove spaces between Chinese characters)
cleaned_text = self._clean_tokenized_text(entity_text)
# Add to our list with both original and cleaned text
filtered_entities.append({
"text": cleaned_text, # Clean text for display/processing
"tokenized_text": entity_text, # Original tokenized text from model
"type": entity_type,
"confidence": confidence_score
})
# Add to our list with both original and cleaned text, only add if confidence score is above threshold
# if entity_group is 'address' or 'company' and the text is 3 characters or fewer, filter it out
if confidence_score > self.confidence_threshold:
filtered_entities.append({
"text": cleaned_text, # Clean text for display/processing
"tokenized_text": entity_text, # Original tokenized text from model
"type": entity_type,
"entity_group": entity_group,
"confidence": confidence_score
})
logger.info(f"Filtered entities: {filtered_entities}")
# filter out entities whose entity_group is 'address' or 'company' and whose text is 3 characters or fewer
filtered_entities = [entity for entity in filtered_entities if entity['entity_group'] not in ['address', 'company'] or len(entity['text']) > 3]
logger.info(f"Final Filtered entities: {filtered_entities}")
return {
"entities": filtered_entities,
"total_count": len(filtered_entities)

View File

@ -1018,11 +1018,12 @@ class NerProcessor:
# Process each chunk with LLM for additional entities
chunk_mappings = []
for i, chunk in enumerate(chunks):
logger.info(f"Processing chunk {i+1}/{len(chunks)} with LLM")
chunk_mapping = self.build_mapping_llm_only(chunk) # LLM-only processing
logger.info(f"Chunk mapping: {chunk_mapping}")
chunk_mappings.extend(chunk_mapping)
# TODO: 临时关闭LLM处理
# for i, chunk in enumerate(chunks):
# logger.info(f"Processing chunk {i+1}/{len(chunks)} with LLM")
# chunk_mapping = self.build_mapping_llm_only(chunk) # LLM-only processing
# logger.info(f"Chunk mapping: {chunk_mapping}")
# chunk_mappings.extend(chunk_mapping)
# Add NER entities to the mappings
if ner_entities: