dev #2
@@ -19,6 +19,7 @@ class NERExtractor(BaseExtractor):
         self.model = None
         self.ner_pipeline = None
         self._model_initialized = False
+        self.confidence_threshold = 0.95
 
         # Map CLUENER model labels to our desired categories
         self.label_map = {
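For context on the truncated `self.label_map`: a plausible shape, assuming the ten standard CLUENER categories as keys and purely illustrative target names as values (the real mapping is cut off at `self.label_map = {` in this diff):

```python
# Illustrative only: the right-hand category names are assumptions, since the
# diff truncates the dict. The keys are the ten CLUENER entity labels.
label_map = {
    "address": "LOCATION",
    "book": "WORK",
    "company": "ORGANIZATION",
    "game": "WORK",
    "government": "ORGANIZATION",
    "movie": "WORK",
    "name": "PERSON",
    "organization": "ORGANIZATION",
    "position": "TITLE",
    "scene": "LOCATION",
}
```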
@@ -103,7 +104,9 @@ class NERExtractor(BaseExtractor):
         """
         try:
             # Run the NER pipeline - it handles truncation automatically
+            logger.info(f"Running NER pipeline with text: {text}")
             results = self.ner_pipeline(text)
+            logger.info(f"NER results: {results}")
 
             # Filter and process entities
             filtered_entities = []
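For anyone reading the new log lines: a Hugging Face token-classification pipeline with aggregation enabled returns one dict per merged entity, so `results` should look roughly like the sketch below (concrete values invented; the keys assume `transformers` with an `aggregation_strategy` set):

```python
# Assumed shape of `results`; the values are invented for illustration.
results = [
    {
        "entity_group": "company",  # label from the CLUENER checkpoint
        "score": 0.9873,            # confidence, later compared to the threshold
        "word": "阿 里 巴 巴",      # word pieces joined with spaces, which is
                                    # why _clean_tokenized_text exists
        "start": 5,
        "end": 9,
    },
]
```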
@@ -119,14 +122,21 @@ class NERExtractor(BaseExtractor):
                 # Clean up the tokenized text (remove spaces between Chinese characters)
                 cleaned_text = self._clean_tokenized_text(entity_text)
 
-                # Add to our list with both original and cleaned text
-                filtered_entities.append({
-                    "text": cleaned_text,  # Clean text for display/processing
-                    "tokenized_text": entity_text,  # Original tokenized text from model
-                    "type": entity_type,
-                    "confidence": confidence_score
-                })
+                # Add to our list with both original and cleaned text, but only if the confidence score is above the threshold
+                # ('address'/'company' entities with 3 or fewer characters are filtered out below)
+                if confidence_score > self.confidence_threshold:
+                    filtered_entities.append({
+                        "text": cleaned_text,  # Clean text for display/processing
+                        "tokenized_text": entity_text,  # Original tokenized text from model
+                        "type": entity_type,
+                        "entity_group": entity_group,
+                        "confidence": confidence_score
+                    })
+            logger.info(f"Filtered entities: {filtered_entities}")
+            # Filter out 'address' and 'company' entities whose text is 3 characters or fewer
+            filtered_entities = [entity for entity in filtered_entities if entity['entity_group'] not in ['address', 'company'] or len(entity['text']) > 3]
+            logger.info(f"Final filtered entities: {filtered_entities}")
 
             return {
                 "entities": filtered_entities,
                 "total_count": len(filtered_entities)
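The two new filters compose as follows; this standalone snippet reproduces their behavior with invented sample entities and can be run directly to sanity-check the logic:

```python
# Standalone check of the confidence gate plus the short address/company rule.
# Sample entities are invented; the threshold mirrors the diff.
confidence_threshold = 0.95

entities = [
    {"text": "北京市朝阳区", "entity_group": "address", "confidence": 0.99},  # kept
    {"text": "北京", "entity_group": "address", "confidence": 0.98},          # dropped: address with <= 3 chars
    {"text": "张三", "entity_group": "name", "confidence": 0.97},             # kept: length rule only targets address/company
    {"text": "某某公司", "entity_group": "company", "confidence": 0.60},      # dropped: below threshold
]

kept = [e for e in entities if e["confidence"] > confidence_threshold]
kept = [e for e in kept
        if e["entity_group"] not in ["address", "company"] or len(e["text"]) > 3]
print(kept)  # 北京市朝阳区 and 张三 survive
```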
@@ -1018,11 +1018,12 @@ class NerProcessor:
 
         # Process each chunk with LLM for additional entities
         chunk_mappings = []
-        for i, chunk in enumerate(chunks):
-            logger.info(f"Processing chunk {i+1}/{len(chunks)} with LLM")
-            chunk_mapping = self.build_mapping_llm_only(chunk)  # LLM-only processing
-            logger.info(f"Chunk mapping: {chunk_mapping}")
-            chunk_mappings.extend(chunk_mapping)
+        # TODO: LLM processing temporarily disabled
+        # for i, chunk in enumerate(chunks):
+        #     logger.info(f"Processing chunk {i+1}/{len(chunks)} with LLM")
+        #     chunk_mapping = self.build_mapping_llm_only(chunk)  # LLM-only processing
+        #     logger.info(f"Chunk mapping: {chunk_mapping}")
+        #     chunk_mappings.extend(chunk_mapping)
 
         # Add NER entities to the mappings
         if ner_entities:
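An alternative worth considering for the temporary switch-off: gate the loop behind a flag instead of commenting it out, so re-enabling is a one-line change and the dead code cannot drift. A sketch only; `self.enable_llm` is hypothetical and not part of this PR:

```python
# Hypothetical drop-in for the commented-out block above. `self.enable_llm`
# does not exist in this PR; it illustrates the design option only.
chunk_mappings = []
if self.enable_llm:  # assumed flag, e.g. read from config
    for i, chunk in enumerate(chunks):
        logger.info(f"Processing chunk {i+1}/{len(chunks)} with LLM")
        chunk_mapping = self.build_mapping_llm_only(chunk)  # LLM-only processing
        logger.info(f"Chunk mapping: {chunk_mapping}")
        chunk_mappings.extend(chunk_mapping)
```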