dev #2
@@ -116,9 +116,13 @@ class NERExtractor(BaseExtractor):
                 entity_text = entity['word']
                 confidence_score = entity['score']
 
-                # Add to our list
+                # Clean up the tokenized text (remove spaces between Chinese characters)
+                cleaned_text = self._clean_tokenized_text(entity_text)
+
+                # Add to our list with both original and cleaned text
                 filtered_entities.append({
-                    "text": entity_text,
+                    "text": cleaned_text,  # Clean text for display/processing
+                    "original_text": entity_text,  # Original tokenized text from model
                     "type": entity_type,
                     "confidence": confidence_score
                 })
@@ -200,6 +204,28 @@ class NERExtractor(BaseExtractor):
             logger.error(f"Error during chunked NER processing: {str(e)}")
             raise Exception(f"Chunked NER processing failed: {str(e)}")
 
+    def _clean_tokenized_text(self, tokenized_text: str) -> str:
+        """
+        Clean up tokenized text by removing spaces between Chinese characters
+
+        Args:
+            tokenized_text: Text with spaces between characters (e.g., "北 京 市")
+
+        Returns:
+            Cleaned text without spaces (e.g., "北京市")
+        """
+        if not tokenized_text:
+            return tokenized_text
+
+        # Remove spaces between Chinese characters
+        # This handles cases like "北 京 市" -> "北京市"
+        cleaned = tokenized_text.replace(" ", "")
+
+        # Also handle cases where there might be multiple spaces
+        cleaned = " ".join(cleaned.split())
+
+        return cleaned
+
     def get_entity_summary(self, entities: List[Dict[str, Any]]) -> Dict[str, Any]:
         """
         Generate a summary of extracted entities by type
@@ -627,12 +627,13 @@ class NerProcessor:
 
         for entity in all_entities:
             if isinstance(entity, dict) and 'text' in entity:
+                # Use cleaned text for deduplication
                 text = entity['text'].strip()
                 if text and text not in seen_texts:
                     seen_texts.add(text)
                     unique_entities.append(entity)
                 elif text and text in seen_texts:
-                    # 暂时记录下可能存在冲突的entity
+                    # Log duplicate entities for debugging
                     logger.info(f"Duplicate entity found: {entity}")
                     continue
 
@@ -691,7 +692,7 @@ class NerProcessor:
             masked = ' '.join([n[0] + '***' if n else '' for n in name.split()])
             group_mask_map[name] = masked
         for entity in unique_entities:
-            text = entity['text']
+            text = entity['text']  # Use cleaned text for mapping
             entity_type = entity.get('type', '')
             if text in group_mask_map:
                 entity_mapping[text] = group_mask_map[text]
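
For reference, a minimal standalone sketch of what the new _clean_tokenized_text helper does to sub-word NER output. The free-standing function below is hypothetical (it is not part of this diff); the example strings come from the method's own docstring:

    # Hypothetical free-standing copy of the helper added in this PR,
    # shown only to illustrate its behaviour on tokenized Chinese text.
    def clean_tokenized_text(tokenized_text: str) -> str:
        if not tokenized_text:
            return tokenized_text
        # Dropping every space collapses "北 京 市" into "北京市".
        cleaned = tokenized_text.replace(" ", "")
        # Once all spaces are removed this split/join pass is effectively a no-op;
        # it would only matter if the blanket replace above were ever relaxed.
        return " ".join(cleaned.split())

    assert clean_tokenized_text("北 京 市") == "北京市"
    assert clean_tokenized_text("") == ""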