feat: 改进ner chunking

2025-08-19 02:15:05 +08:00 · 2025-08-19 02:15:05 +08:00 · 40dd0de1b3
parent d446ac1854
commit 40dd0de1b3
2 changed files with 32 additions and 5 deletions
--- a/backend/app/core/document_handlers/extractors/ner_extractor.py
+++ b/backend/app/core/document_handlers/extractors/ner_extractor.py
@ -116,9 +116,13 @@ class NERExtractor(BaseExtractor):
                    entity_text = entity['word']
                    confidence_score = entity['score']
                    
-                    # Add to our list
+                    # Clean up the tokenized text (remove spaces between Chinese characters)
+                    cleaned_text = self._clean_tokenized_text(entity_text)
+                    
+                    # Add to our list with both original and cleaned text
                    filtered_entities.append({
-                        "text": entity_text,
+                        "text": cleaned_text,  # Clean text for display/processing
+                        "original_text": entity_text,  # Original tokenized text from model
                        "type": entity_type,
                        "confidence": confidence_score
                    })
@ -200,6 +204,28 @@ class NERExtractor(BaseExtractor):
            logger.error(f"Error during chunked NER processing: {str(e)}")
            raise Exception(f"Chunked NER processing failed: {str(e)}")
    
+    def _clean_tokenized_text(self, tokenized_text: str) -> str:
+        """
+        Remove tokenizer-inserted whitespace next to CJK characters.
+        
+        Whitespace between two non-CJK characters is collapsed to one space
+        and kept, so "北 京 市" -> "北京市" while "New  York" -> "New York".
+        
+        Args:
+            tokenized_text: Entity text as produced by the model/tokenizer
+            
+        Returns:
+            Cleaned text; empty or None input is returned unchanged
+        """
+        import re  # local import keeps this helper self-contained
+        
+        if not tokenized_text:
+            return tokenized_text
+        # CJK ideographs, CJK punctuation and full-width forms
+        cjk = "[\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\u3001-\u303f\uff00-\uffef]"
+        collapsed = " ".join(tokenized_text.split())  # trim, single spaces only
+        return re.sub(f"(?<={cjk}) | (?={cjk})", "", collapsed)
+    
    def get_entity_summary(self, entities: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Generate a summary of extracted entities by type
--- a/backend/app/core/document_handlers/ner_processor.py
+++ b/backend/app/core/document_handlers/ner_processor.py
@ -627,12 +627,13 @@ class NerProcessor:
        
        for entity in all_entities:
            if isinstance(entity, dict) and 'text' in entity:
+                # Use cleaned text for deduplication
                text = entity['text'].strip()
                if text and text not in seen_texts:
                    seen_texts.add(text)
                    unique_entities.append(entity)
-                elif text and  text in seen_texts:
-                    # 暂时记录下可能存在冲突的entity
+                elif text and text in seen_texts:
+                    # Log duplicate entities for debugging
                    logger.info(f"Duplicate entity found: {entity}")
                    continue
        
@ -691,7 +692,7 @@ class NerProcessor:
                    masked = ' '.join([n[0] + '***' if n else '' for n in name.split()])
                    group_mask_map[name] = masked
        for entity in unique_entities:
-            text = entity['text']
+            text = entity['text']  # Use cleaned text for mapping
            entity_type = entity.get('type', '')
            if text in group_mask_map:
                entity_mapping[text] = group_mask_map[text]