feat: filter out low-confidence entities
parent 24f452818a
commit ffa31d33de
@@ -19,6 +19,7 @@ class NERExtractor(BaseExtractor):
         self.model = None
         self.ner_pipeline = None
         self._model_initialized = False
+        self.confidence_threshold = 0.95

         # Map CLUENER model labels to our desired categories
         self.label_map = {
@@ -103,7 +104,9 @@ class NERExtractor(BaseExtractor):
         """
         try:
             # Run the NER pipeline - it handles truncation automatically
+            logger.info(f"Running NER pipeline with text: {text}")
             results = self.ner_pipeline(text)
+            logger.info(f"NER results: {results}")

             # Filter and process entities
             filtered_entities = []
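For context on the fields used in the next hunk: self.ner_pipeline is the wrapped NER pipeline whose raw output is now logged above. Below is a minimal sketch of how such a pipeline is typically built with Hugging Face transformers and what its aggregated output looks like; the model path is a placeholder, and the actual initialization in this repository is not shown in this diff.

# Sketch only, assuming a Hugging Face token-classification pipeline.
# "path/to/cluener-model" is a placeholder, not the model this repo actually loads.
from transformers import pipeline

ner_pipeline = pipeline(
    "token-classification",
    model="path/to/cluener-model",
    aggregation_strategy="simple",  # merge sub-word tokens into whole entities
)

results = ner_pipeline("阿里巴巴总部位于杭州市")
# With aggregation enabled, each item looks roughly like:
#   {"entity_group": "company", "score": 0.98, "word": "阿 里 巴 巴", "start": 0, "end": 4}
# which is where entity_group, the confidence score, and the space-separated
# tokenized text handled by _clean_tokenized_text come from.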
@@ -119,14 +122,21 @@ class NERExtractor(BaseExtractor):
                 # Clean up the tokenized text (remove spaces between Chinese characters)
                 cleaned_text = self._clean_tokenized_text(entity_text)

-                # Add to our list with both original and cleaned text
-                filtered_entities.append({
-                    "text": cleaned_text,  # Clean text for display/processing
-                    "tokenized_text": entity_text,  # Original tokenized text from model
-                    "type": entity_type,
-                    "confidence": confidence_score
-                })
+                # Add to our list with both original and cleaned text, but only if the confidence score is above the threshold
+                # 'address' or 'company' entities with 3 or fewer characters are filtered out again below
+                if confidence_score > self.confidence_threshold:
+                    filtered_entities.append({
+                        "text": cleaned_text,  # Clean text for display/processing
+                        "tokenized_text": entity_text,  # Original tokenized text from model
+                        "type": entity_type,
+                        "entity_group": entity_group,
+                        "confidence": confidence_score
+                    })
+            logger.info(f"Filtered entities: {filtered_entities}")
+            # Filter out 'address' or 'company' entities whose cleaned text is 3 characters or shorter
+            filtered_entities = [entity for entity in filtered_entities if entity['entity_group'] not in ['address', 'company'] or len(entity['text']) > 3]
+            logger.info(f"Final filtered entities: {filtered_entities}")

             return {
                 "entities": filtered_entities,
                 "total_count": len(filtered_entities)
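Putting the pieces of this change together: entities are filtered in two passes — first by the 0.95 confidence threshold as they are collected, then by a length check on 'address' and 'company' entities. Below is a minimal standalone sketch of the same logic, using hypothetical sample data in place of the pipeline output; clean_tokenized_text here is a simplified stand-in for _clean_tokenized_text.

# Standalone sketch of the two-pass filtering added in this commit.
# raw_results is hypothetical sample data; in NERExtractor it comes from self.ner_pipeline(text).
CONFIDENCE_THRESHOLD = 0.95  # mirrors self.confidence_threshold

raw_results = [
    {"entity_group": "company", "word": "阿 里 巴 巴", "score": 0.99},  # kept: confident and 4 chars long
    {"entity_group": "address", "word": "北 京", "score": 0.97},        # dropped: address with only 2 chars
    {"entity_group": "name", "word": "张 三", "score": 0.60},           # dropped: below threshold
]

def clean_tokenized_text(text: str) -> str:
    # Simplified stand-in: remove the spaces the tokenizer inserts between Chinese characters
    return text.replace(" ", "")

# Pass 1: keep only entities above the confidence threshold
filtered_entities = []
for item in raw_results:
    if item["score"] > CONFIDENCE_THRESHOLD:
        filtered_entities.append({
            "text": clean_tokenized_text(item["word"]),
            "tokenized_text": item["word"],
            "entity_group": item["entity_group"],
            "confidence": item["score"],
        })

# Pass 2: drop 'address'/'company' entities whose cleaned text is 3 characters or shorter
filtered_entities = [
    entity for entity in filtered_entities
    if entity["entity_group"] not in ["address", "company"] or len(entity["text"]) > 3
]

print(filtered_entities)  # only the 阿里巴巴 entity survives both passes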
@@ -1018,11 +1018,12 @@ class NerProcessor:

         # Process each chunk with LLM for additional entities
         chunk_mappings = []
-        for i, chunk in enumerate(chunks):
-            logger.info(f"Processing chunk {i+1}/{len(chunks)} with LLM")
-            chunk_mapping = self.build_mapping_llm_only(chunk)  # LLM-only processing
-            logger.info(f"Chunk mapping: {chunk_mapping}")
-            chunk_mappings.extend(chunk_mapping)
+        # TODO: LLM processing is temporarily disabled
+        # for i, chunk in enumerate(chunks):
+        #     logger.info(f"Processing chunk {i+1}/{len(chunks)} with LLM")
+        #     chunk_mapping = self.build_mapping_llm_only(chunk)  # LLM-only processing
+        #     logger.info(f"Chunk mapping: {chunk_mapping}")
+        #     chunk_mappings.extend(chunk_mapping)

         # Add NER entities to the mappings
         if ner_entities: