dev #2
@@ -19,6 +19,7 @@ class NERExtractor(BaseExtractor):
         self.model = None
         self.ner_pipeline = None
         self._model_initialized = False
+        self.confidence_threshold = 0.95
 
         # Map CLUENER model labels to our desired categories
         self.label_map = {
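For context on the truncated `self.label_map`: a plausible shape, assuming the ten standard CLUENER categories as keys and purely illustrative target names as values (the real mapping is cut off at `self.label_map = {` in this diff):

```python
# Illustrative only: the right-hand category names are assumptions, since the
# diff truncates the dict. The keys are the ten CLUENER entity labels.
label_map = {
    "address": "LOCATION",
    "book": "WORK",
    "company": "ORGANIZATION",
    "game": "WORK",
    "government": "ORGANIZATION",
    "movie": "WORK",
    "name": "PERSON",
    "organization": "ORGANIZATION",
    "position": "TITLE",
    "scene": "LOCATION",
}
```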
@@ -103,7 +104,9 @@ class NERExtractor(BaseExtractor):
         """
         try:
             # Run the NER pipeline - it handles truncation automatically
+            logger.info(f"Running NER pipeline with text: {text}")
             results = self.ner_pipeline(text)
+            logger.info(f"NER results: {results}")
 
             # Filter and process entities
             filtered_entities = []
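For anyone reading the new log lines: a Hugging Face token-classification pipeline with aggregation enabled returns one dict per merged entity, so `results` should look roughly like the sketch below (concrete values invented; the keys assume `transformers` with an `aggregation_strategy` set):

```python
# Assumed shape of `results`; the values are invented for illustration.
results = [
    {
        "entity_group": "company",  # label from the CLUENER checkpoint
        "score": 0.9873,            # confidence, later compared to the threshold
        "word": "阿 里 巴 巴",      # word pieces joined with spaces, which is
                                    # why _clean_tokenized_text exists
        "start": 5,
        "end": 9,
    },
]
```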
@@ -119,14 +122,21 @@ class NERExtractor(BaseExtractor):
                 # Clean up the tokenized text (remove spaces between Chinese characters)
                 cleaned_text = self._clean_tokenized_text(entity_text)
 
-                # Add to our list with both original and cleaned text
-                filtered_entities.append({
-                    "text": cleaned_text,  # Clean text for display/processing
-                    "tokenized_text": entity_text,  # Original tokenized text from model
-                    "type": entity_type,
-                    "confidence": confidence_score
-                })
+                # Add to our list with both original and cleaned text, but only if the confidence score is above the threshold
+                # ('address'/'company' entities with 3 or fewer characters are filtered out below)
+                if confidence_score > self.confidence_threshold:
+                    filtered_entities.append({
+                        "text": cleaned_text,  # Clean text for display/processing
+                        "tokenized_text": entity_text,  # Original tokenized text from model
+                        "type": entity_type,
+                        "entity_group": entity_group,
+                        "confidence": confidence_score
+                    })
+            logger.info(f"Filtered entities: {filtered_entities}")
+            # Filter out 'address' and 'company' entities whose text is 3 characters or fewer
+            filtered_entities = [entity for entity in filtered_entities if entity['entity_group'] not in ['address', 'company'] or len(entity['text']) > 3]
+            logger.info(f"Final filtered entities: {filtered_entities}")
 
             return {
                 "entities": filtered_entities,
                 "total_count": len(filtered_entities)
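The two new filters compose as follows; this standalone snippet reproduces their behavior with invented sample entities and can be run directly to sanity-check the logic:

```python
# Standalone check of the confidence gate plus the short address/company rule.
# Sample entities are invented; the threshold mirrors the diff.
confidence_threshold = 0.95

entities = [
    {"text": "北京市朝阳区", "entity_group": "address", "confidence": 0.99},  # kept
    {"text": "北京", "entity_group": "address", "confidence": 0.98},          # dropped: address with <= 3 chars
    {"text": "张三", "entity_group": "name", "confidence": 0.97},             # kept: length rule only targets address/company
    {"text": "某某公司", "entity_group": "company", "confidence": 0.60},      # dropped: below threshold
]

kept = [e for e in entities if e["confidence"] > confidence_threshold]
kept = [e for e in kept
        if e["entity_group"] not in ["address", "company"] or len(e["text"]) > 3]
print(kept)  # 北京市朝阳区 and 张三 survive
```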
@@ -1018,11 +1018,12 @@ class NerProcessor:
 
         # Process each chunk with LLM for additional entities
         chunk_mappings = []
-        for i, chunk in enumerate(chunks):
-            logger.info(f"Processing chunk {i+1}/{len(chunks)} with LLM")
-            chunk_mapping = self.build_mapping_llm_only(chunk)  # LLM-only processing
-            logger.info(f"Chunk mapping: {chunk_mapping}")
-            chunk_mappings.extend(chunk_mapping)
+        # TODO: LLM processing temporarily disabled
+        # for i, chunk in enumerate(chunks):
+        #     logger.info(f"Processing chunk {i+1}/{len(chunks)} with LLM")
+        #     chunk_mapping = self.build_mapping_llm_only(chunk)  # LLM-only processing
+        #     logger.info(f"Chunk mapping: {chunk_mapping}")
+        #     chunk_mappings.extend(chunk_mapping)
 
         # Add NER entities to the mappings
         if ner_entities:
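An alternative worth considering for the temporary switch-off: gate the loop behind a flag instead of commenting it out, so re-enabling is a one-line change and the dead code cannot drift. A sketch only; `self.enable_llm` is hypothetical and not part of this PR:

```python
# Hypothetical drop-in for the commented-out block above. `self.enable_llm`
# does not exist in this PR; it illustrates the design option only.
chunk_mappings = []
if self.enable_llm:  # assumed flag, e.g. read from config
    for i, chunk in enumerate(chunks):
        logger.info(f"Processing chunk {i+1}/{len(chunks)} with LLM")
        chunk_mapping = self.build_mapping_llm_only(chunk)  # LLM-only processing
        logger.info(f"Chunk mapping: {chunk_mapping}")
        chunk_mappings.extend(chunk_mapping)
```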