feat: 过滤掉置信度低的entity

This commit is contained in:
tigerenwork 2025-08-19 17:26:30 +08:00
parent 24f452818a
commit ffa31d33de
2 changed files with 24 additions and 13 deletions

View File

@ -19,6 +19,7 @@ class NERExtractor(BaseExtractor):
self.model = None
self.ner_pipeline = None
self._model_initialized = False
self.confidence_threshold = 0.95
# Map CLUENER model labels to our desired categories
self.label_map = {
@ -103,7 +104,9 @@ class NERExtractor(BaseExtractor):
"""
try:
# Run the NER pipeline - it handles truncation automatically
logger.info(f"Running NER pipeline with text: {text}")
results = self.ner_pipeline(text)
logger.info(f"NER results: {results}")
# Filter and process entities
filtered_entities = []
@ -119,14 +122,21 @@ class NERExtractor(BaseExtractor):
# Clean up the tokenized text (remove spaces between Chinese characters)
cleaned_text = self._clean_tokenized_text(entity_text)
# Add to our list with both original and cleaned text
filtered_entities.append({
"text": cleaned_text, # Clean text for display/processing
"tokenized_text": entity_text, # Original tokenized text from model
"type": entity_type,
"confidence": confidence_score
})
# Add to our list with both original and cleaned text, only add if confidence score is above threshold
# if entity_group is 'address' or 'company' and the text is 3 characters or fewer, filter it out
if confidence_score > self.confidence_threshold:
filtered_entities.append({
"text": cleaned_text, # Clean text for display/processing
"tokenized_text": entity_text, # Original tokenized text from model
"type": entity_type,
"entity_group": entity_group,
"confidence": confidence_score
})
logger.info(f"Filtered entities: {filtered_entities}")
# filter out entities whose entity_group is 'address' or 'company' and whose text is 3 characters or fewer
filtered_entities = [entity for entity in filtered_entities if entity['entity_group'] not in ['address', 'company'] or len(entity['text']) > 3]
logger.info(f"Final Filtered entities: {filtered_entities}")
return {
"entities": filtered_entities,
"total_count": len(filtered_entities)

View File

@ -1018,11 +1018,12 @@ class NerProcessor:
# Process each chunk with LLM for additional entities
chunk_mappings = []
for i, chunk in enumerate(chunks):
logger.info(f"Processing chunk {i+1}/{len(chunks)} with LLM")
chunk_mapping = self.build_mapping_llm_only(chunk) # LLM-only processing
logger.info(f"Chunk mapping: {chunk_mapping}")
chunk_mappings.extend(chunk_mapping)
# TODO: 临时关闭LLM处理
# for i, chunk in enumerate(chunks):
# logger.info(f"Processing chunk {i+1}/{len(chunks)} with LLM")
# chunk_mapping = self.build_mapping_llm_only(chunk) # LLM-only processing
# logger.info(f"Chunk mapping: {chunk_mapping}")
# chunk_mappings.extend(chunk_mapping)
# Add NER entities to the mappings
if ner_entities: