dev #2

Merged
tigeren merged 21 commits from dev into main 2025-08-20 02:20:42 +00:00
2 changed files with 32 additions and 5 deletions
Showing only changes of commit 40dd0de1b3 - Show all commits

View File

@@ -116,9 +116,13 @@ class NERExtractor(BaseExtractor):
entity_text = entity['word'] entity_text = entity['word']
confidence_score = entity['score'] confidence_score = entity['score']
# Add to our list # Clean up the tokenized text (remove spaces between Chinese characters)
cleaned_text = self._clean_tokenized_text(entity_text)
# Add to our list with both original and cleaned text
filtered_entities.append({ filtered_entities.append({
"text": entity_text, "text": cleaned_text, # Clean text for display/processing
"original_text": entity_text, # Original tokenized text from model
"type": entity_type, "type": entity_type,
"confidence": confidence_score "confidence": confidence_score
}) })
@@ -200,6 +204,28 @@ class NERExtractor(BaseExtractor):
logger.error(f"Error during chunked NER processing: {str(e)}") logger.error(f"Error during chunked NER processing: {str(e)}")
raise Exception(f"Chunked NER processing failed: {str(e)}") raise Exception(f"Chunked NER processing failed: {str(e)}")
def _clean_tokenized_text(self, tokenized_text: str) -> str:
"""
Clean up tokenized text by removing spaces between Chinese characters
Args:
tokenized_text: Text with spaces between characters (e.g., "北 京 市")
Returns:
Cleaned text without spaces (e.g., "北京市")
"""
if not tokenized_text:
return tokenized_text
# Remove spaces between Chinese characters
# This handles cases like "北 京 市" -> "北京市"
cleaned = tokenized_text.replace(" ", "")
# Also handle cases where there might be multiple spaces
cleaned = " ".join(cleaned.split())
return cleaned
def get_entity_summary(self, entities: List[Dict[str, Any]]) -> Dict[str, Any]: def get_entity_summary(self, entities: List[Dict[str, Any]]) -> Dict[str, Any]:
""" """
Generate a summary of extracted entities by type Generate a summary of extracted entities by type

View File

@@ -627,12 +627,13 @@ class NerProcessor:
for entity in all_entities: for entity in all_entities:
if isinstance(entity, dict) and 'text' in entity: if isinstance(entity, dict) and 'text' in entity:
# Use cleaned text for deduplication
text = entity['text'].strip() text = entity['text'].strip()
if text and text not in seen_texts: if text and text not in seen_texts:
seen_texts.add(text) seen_texts.add(text)
unique_entities.append(entity) unique_entities.append(entity)
elif text and text in seen_texts: elif text and text in seen_texts:
# 暂时记录下可能存在冲突的entity # Log duplicate entities for debugging
logger.info(f"Duplicate entity found: {entity}") logger.info(f"Duplicate entity found: {entity}")
continue continue
@@ -691,7 +692,7 @@ class NerProcessor:
masked = ' '.join([n[0] + '***' if n else '' for n in name.split()]) masked = ' '.join([n[0] + '***' if n else '' for n in name.split()])
group_mask_map[name] = masked group_mask_map[name] = masked
for entity in unique_entities: for entity in unique_entities:
text = entity['text'] text = entity['text'] # Use cleaned text for mapping
entity_type = entity.get('type', '') entity_type = entity.get('type', '')
if text in group_mask_map: if text in group_mask_map:
entity_mapping[text] = group_mask_map[text] entity_mapping[text] = group_mask_map[text]