feat：优化公司名简化性能

2025-08-19 23:28:56 +08:00 · 2025-08-19 23:28:56 +08:00 · a001c26e8d
parent eb33dc137e
commit a001c26e8d
1 changed files with 166 additions and 18 deletions
--- a/backend/app/core/document_handlers/ner_processor.py
+++ b/backend/app/core/document_handlers/ner_processor.py
@ -863,26 +863,66 @@ class NerProcessor:
        for group in linkage.get('entity_groups', []):
            group_type = group.get('group_type', '')
            entities = group.get('entities', [])
+            
            if '公司' in group_type or 'Company' in group_type:
-                for entity in entities:
-                    # 使用新的公司名称脱敏方法
-                    masked = self._mask_company_name(entity['text'])
-                    group_mask_map[entity['text']] = masked
+                # 🚀 OPTIMIZATION: Find primary entity and mask once
+                primary_entity = self._find_primary_company_entity(entities)
+                if primary_entity:
+                    # Call _mask_company_name only once for the primary entity
+                    primary_masked = self._mask_company_name(primary_entity['text'])
+                    logger.info(f"Masked primary company '{primary_entity['text']}' -> '{primary_masked}'")
+                    
+                    # Use the same masked name for all entities in the group
+                    for entity in entities:
+                        group_mask_map[entity['text']] = primary_masked
+                        logger.debug(f"Applied same mask '{primary_masked}' to '{entity['text']}'")
+                else:
+                    # Fallback: mask each entity individually if no primary found
+                    for entity in entities:
+                        masked = self._mask_company_name(entity['text'])
+                        group_mask_map[entity['text']] = masked
+                        
            elif '人名' in group_type:
-                for entity in entities:
-                    name = entity['text']
-                    if not name:
-                        continue
-                    # 使用新的中文姓名脱敏方法
-                    masked = self._mask_chinese_name(name, surname_counter)
-                    group_mask_map[name] = masked
+                # 🚀 OPTIMIZATION: Find primary entity and mask once
+                primary_entity = self._find_primary_person_entity(entities)
+                if primary_entity:
+                    # Call _mask_chinese_name only once for the primary entity
+                    primary_masked = self._mask_chinese_name(primary_entity['text'], surname_counter)
+                    logger.info(f"Masked primary person '{primary_entity['text']}' -> '{primary_masked}'")
+                    
+                    # Use the same masked name for all entities in the group
+                    for entity in entities:
+                        group_mask_map[entity['text']] = primary_masked
+                        logger.debug(f"Applied same mask '{primary_masked}' to '{entity['text']}'")
+                else:
+                    # Fallback: mask each entity individually if no primary found
+                    for entity in entities:
+                        name = entity['text']
+                        if not name:
+                            continue
+                        masked = self._mask_chinese_name(name, surname_counter)
+                        group_mask_map[name] = masked
+                        
            elif '英文人名' in group_type:
-                for entity in entities:
-                    name = entity['text']
-                    if not name:
-                        continue
-                    masked = ' '.join([n[0] + '***' if n else '' for n in name.split()])
-                    group_mask_map[name] = masked
+                # 🚀 OPTIMIZATION: Find primary entity and mask once
+                primary_entity = self._find_primary_person_entity(entities)
+                if primary_entity:
+                    # Call masking only once for the primary entity
+                    primary_masked = ' '.join([n[0] + '***' if n else '' for n in primary_entity['text'].split()])
+                    logger.info(f"Masked primary English person '{primary_entity['text']}' -> '{primary_masked}'")
+                    
+                    # Use the same masked name for all entities in the group
+                    for entity in entities:
+                        group_mask_map[entity['text']] = primary_masked
+                        logger.debug(f"Applied same mask '{primary_masked}' to '{entity['text']}'")
+                else:
+                    # Fallback: mask each entity individually if no primary found
+                    for entity in entities:
+                        name = entity['text']
+                        if not name:
+                            continue
+                        masked = ' '.join([n[0] + '***' if n else '' for n in name.split()])
+                        group_mask_map[name] = masked
        for entity in unique_entities:
            text = entity['text']  # Use cleaned text for mapping
            entity_type = entity.get('type', '')
@ -958,6 +998,114 @@ class NerProcessor:
                used_masked_names.add(masked)
        return entity_mapping

+    def _find_primary_company_entity(self, entities: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
+        """
+        Pick the entity whose text is masked on behalf of a whole company group.
+
+        Only entities with a non-empty 'text' are considered, so an empty or
+        missing text is never chosen as the source of the shared mask.
+
+        Selection order:
+        1. The first entity flagged with a truthy 'is_primary'.
+        2. Otherwise the longest text among entities whose 'type' contains
+           '公司名称' but not '简称' (i.e. not tagged as an abbreviation).
+        3. Otherwise the longest text among the remaining entities.
+        Length ties go to the entity that appears first in the list.
+
+        Args:
+            entities: Company entities belonging to one linkage group.
+
+        Returns:
+            The selected entity, or None when the group is empty or no entity
+            has usable text (callers then mask each entity individually).
+        """
+        # A shared mask can only be derived from an entity that has text.
+        usable = [entity for entity in entities or [] if entity.get('text')]
+        if not usable:
+            return None
+
+        # An explicit marker always wins over the heuristics below.
+        for entity in usable:
+            if entity.get('is_primary', False):
+                logger.debug(f"Found explicitly marked primary company: {entity['text']}")
+                return entity
+
+        # Split full company names from everything else (abbreviations, or
+        # entities whose type is missing or unrecognised).
+        primary_candidates = []
+        secondary_candidates = []
+        for entity in usable:
+            entity_type = entity.get('type') or ''
+            if '公司名称' in entity_type and '简称' not in entity_type:
+                primary_candidates.append(entity)
+            else:
+                secondary_candidates.append(entity)
+
+        # Every usable entity is in exactly one list, so one branch always runs.
+        if primary_candidates:
+            primary_entity = max(primary_candidates, key=lambda x: len(x['text']))
+            logger.debug(f"Selected primary company from primary candidates: {primary_entity['text']}")
+            return primary_entity
+
+        primary_entity = max(secondary_candidates, key=lambda x: len(x['text']))
+        logger.debug(f"Selected primary company from secondary candidates: {primary_entity['text']}")
+        return primary_entity
+
+    def _find_primary_person_entity(self, entities: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
+        """
+        Pick the entity whose text is masked on behalf of a whole person group.
+
+        Only entities with a non-empty 'text' are considered, so an empty or
+        missing text is never chosen as the source of the shared mask.
+
+        Selection order:
+        1. The first entity flagged with a truthy 'is_primary'.
+        2. Otherwise the longest text among entities whose 'type' does not
+           contain '英文人名' (Chinese names, plus any entity with a missing
+           or unrecognised type, which is treated as Chinese by default).
+        3. Otherwise the longest text among the '英文人名' entities.
+        Length ties go to the entity that appears first in the list.
+
+        Args:
+            entities: Person entities belonging to one linkage group.
+
+        Returns:
+            The selected entity, or None when the group is empty or no entity
+            has usable text (callers then mask each entity individually).
+        """
+        # A shared mask can only be derived from an entity that has text.
+        usable = [entity for entity in entities or [] if entity.get('text')]
+        if not usable:
+            return None
+
+        # An explicit marker always wins over the heuristics below.
+        for entity in usable:
+            if entity.get('is_primary', False):
+                logger.debug(f"Found explicitly marked primary person: {entity['text']}")
+                return entity
+
+        # Only a type containing '英文人名' counts as English; every other
+        # entity is grouped with the Chinese names.
+        chinese_candidates = []
+        english_candidates = []
+
+        for entity in usable:
+            entity_type = entity.get('type') or ''
+            if '英文人名' in entity_type:
+                english_candidates.append(entity)
+            else:
+                chinese_candidates.append(entity)
+
+        # Every usable entity is in exactly one list, so one branch always runs.
+        if chinese_candidates:
+            primary_entity = max(chinese_candidates, key=lambda x: len(x['text']))
+            logger.debug(f"Selected primary person from Chinese candidates: {primary_entity['text']}")
+            return primary_entity
+
+        primary_entity = max(english_candidates, key=lambda x: len(x['text']))
+        logger.debug(f"Selected primary person from English candidates: {primary_entity['text']}")
+        return primary_entity
+
    def _validate_linkage_format(self, linkage: Dict[str, Any]) -> bool:
        return LLMResponseValidator.validate_entity_linkage(linkage)

@ -965,7 +1113,7 @@ class NerProcessor:
        linkable_entities = []
        for entity in unique_entities:
            entity_type = entity.get('type', '')
-            if any(keyword in entity_type for keyword in ['公司', 'Company', '人名', '英文人名']):
+            if any(keyword in entity_type for keyword in ['公司', '公司名称', 'Company', '人名', '英文人名']):
                linkable_entities.append(entity)
        
        if not linkable_entities: