feat: 更新替换算法,解决匹配token中有空格的问题

This commit is contained in:
tigerenwork 2025-08-19 16:08:49 +08:00
parent 40dd0de1b3
commit 24f452818a
11 changed files with 1013 additions and 17 deletions

View File

@ -86,7 +86,7 @@ docker-compose build frontend
docker-compose build mineru-api docker-compose build mineru-api
# Build multiple specific services # Build multiple specific services
docker-compose build backend-api frontend docker-compose build backend-api frontend celery-worker
``` ```
### Building and restarting specific services ### Building and restarting specific services

View File

@ -40,17 +40,36 @@ class DocumentProcessor(ABC):
return chunks return chunks
def _apply_mapping(self, text: str, mapping: Dict[str, str]) -> str: def _apply_mapping_with_alignment(self, text: str, mapping: Dict[str, str]) -> str:
"""Apply the mapping to replace sensitive information""" """
masked_text = text Apply the mapping to replace sensitive information using character-by-character alignment.
for original, masked in mapping.items():
if isinstance(masked, dict): This method uses the new alignment-based masking to handle spacing issues
masked = next(iter(masked.values()), "") between NER results and original document text.
elif not isinstance(masked, str):
masked = str(masked) if masked is not None else "" Args:
masked_text = masked_text.replace(original, masked) text: Original document text
mapping: Dictionary mapping original entity text to masked text
Returns:
Masked document text
"""
logger.info(f"Applying entity mapping with alignment to text of length {len(text)}")
logger.debug(f"Entity mapping: {mapping}")
# Use the new alignment-based masking method
masked_text = self.ner_processor.apply_entity_masking_with_alignment(text, mapping)
logger.info("Successfully applied entity masking with alignment")
return masked_text return masked_text
def _apply_mapping(self, text: str, mapping: Dict[str, str]) -> str:
    """
    Backward-compatible entry point for masking ``text`` with ``mapping``.

    This method performs no replacement of its own; it forwards both
    arguments unchanged to ``_apply_mapping_with_alignment`` and returns
    whatever that method returns.
    """
    masked = self._apply_mapping_with_alignment(text, mapping)
    return masked
def process_content(self, content: str) -> str: def process_content(self, content: str) -> str:
"""Process document content by masking sensitive information""" """Process document content by masking sensitive information"""
sentences = content.split("") sentences = content.split("")
@ -59,9 +78,11 @@ class DocumentProcessor(ABC):
logger.info(f"Split content into {len(chunks)} chunks") logger.info(f"Split content into {len(chunks)} chunks")
final_mapping = self.ner_processor.process(chunks) final_mapping = self.ner_processor.process(chunks)
logger.info(f"Generated entity mapping with {len(final_mapping)} entities")
masked_content = self._apply_mapping(content, final_mapping) # Use the new alignment-based masking
logger.info("Successfully masked content") masked_content = self._apply_mapping_with_alignment(content, final_mapping)
logger.info("Successfully masked content using character alignment")
return masked_content return masked_content

View File

@ -122,7 +122,7 @@ class NERExtractor(BaseExtractor):
# Add to our list with both original and cleaned text # Add to our list with both original and cleaned text
filtered_entities.append({ filtered_entities.append({
"text": cleaned_text, # Clean text for display/processing "text": cleaned_text, # Clean text for display/processing
"original_text": entity_text, # Original tokenized text from model "tokenized_text": entity_text, # Original tokenized text from model
"type": entity_type, "type": entity_type,
"confidence": confidence_score "confidence": confidence_score
}) })

View File

@ -9,7 +9,7 @@ from .maskers.company_masker import CompanyMasker
from .maskers.address_masker import AddressMasker from .maskers.address_masker import AddressMasker
from .maskers.id_masker import IDMasker from .maskers.id_masker import IDMasker
from .maskers.case_masker import CaseMasker from .maskers.case_masker import CaseMasker
from ...services.ollama_client import OllamaClient from ..services.ollama_client import OllamaClient
class MaskerFactory: class MaskerFactory:

View File

@ -1,4 +1,4 @@
from typing import Any, Dict, List from typing import Any, Dict, List, Tuple, Optional
from ..prompts.masking_prompts import get_ner_name_prompt, get_ner_company_prompt, get_ner_address_prompt, get_ner_project_prompt, get_ner_case_number_prompt, get_entity_linkage_prompt from ..prompts.masking_prompts import get_ner_name_prompt, get_ner_company_prompt, get_ner_address_prompt, get_ner_project_prompt, get_ner_case_number_prompt, get_entity_linkage_prompt
import logging import logging
import json import json
@ -20,9 +20,201 @@ class NerProcessor:
# Initialize NER extractor for ML-based entity extraction # Initialize NER extractor for ML-based entity extraction
self.ner_extractor = NERExtractor() self.ner_extractor = NERExtractor()
def _find_entity_alignment(self, entity_text: str, original_document_text: str) -> Optional[Tuple[int, int, str]]:
    """
    Locate the first occurrence of an entity in the document, ignoring spaces.

    Both strings are compared with every ' ' removed, so an entity such as
    "李 淼" matches "李淼" in the document and vice versa. The returned span
    refers to the original (un-stripped) document and runs from the first
    matched character to just past the last matched character; spaces
    *inside* the match are included, spaces before or after it are not.

    Args:
        entity_text: The entity text to find (may contain spaces).
        original_document_text: The document text to search (may contain spaces).

    Returns:
        Tuple of (start_pos, end_pos, found_text), or None when the entity
        has no non-space characters or does not occur in the document.
    """
    clean_entity = entity_text.replace(" ", "")
    # An empty pattern would "match" at index 0 and make callers insert the
    # mask without consuming any text, so treat it as not found.
    if not clean_entity:
        return None

    # positions[k] is the index in the original text of the k-th non-space
    # character, i.e. of character k of the space-stripped document.
    positions = [idx for idx, ch in enumerate(original_document_text) if ch != " "]
    hit = original_document_text.replace(" ", "").find(clean_entity)
    if hit == -1:
        return None

    start_pos = positions[hit]
    end_pos = positions[hit + len(clean_entity) - 1] + 1
    return start_pos, end_pos, original_document_text[start_pos:end_pos]
def _map_char_positions_to_original(self, clean_start: int, entity_length: int, original_text: str) -> Tuple[int, int, str]:
    """
    Map a span of the space-stripped text back onto the original text.

    Args:
        clean_start: Start index of the span in the text with ' ' removed.
        entity_length: Number of non-space characters in the span.
        original_text: Original document text, spaces included.

    Returns:
        Tuple of (start_pos, end_pos, found_text) in the original text.
        The span starts on the first matched non-space character and ends
        just past the last one; spaces between them are part of the span.
    """
    text_length = len(original_text)
    position = 0

    # Walk past the first `clean_start` non-space characters.
    consumed = 0
    while consumed < clean_start and position < text_length:
        if original_text[position] != " ":
            consumed += 1
        position += 1

    # The walk above stops right after the preceding character; step over any
    # spaces so the span does not swallow whitespace that precedes the entity.
    if entity_length > 0:
        while position < text_length and original_text[position] == " ":
            position += 1
    start_pos = position

    # Extend the span until `entity_length` non-space characters are covered.
    matched = 0
    while matched < entity_length and position < text_length:
        if original_text[position] != " ":
            matched += 1
        position += 1
    end_pos = position

    return start_pos, end_pos, original_text[start_pos:end_pos]
def _validate_mapping_format(self, mapping: Dict[str, Any]) -> bool: def _validate_mapping_format(self, mapping: Dict[str, Any]) -> bool:
return LLMResponseValidator.validate_entity_extraction(mapping) return LLMResponseValidator.validate_entity_extraction(mapping)
def apply_entity_masking_with_alignment(self, original_document_text: str, entity_mapping: Dict[str, str], mask_char: str = "*") -> str:
    """
    Replace every occurrence of each mapped entity, matching space-insensitively.

    Entities are processed longest first. For each entity the document is
    scanned once from left to right and every non-overlapping occurrence is
    replaced; scanning resumes *after* each match, so a mask that itself
    contains the entity text is never re-masked and there is no limit on the
    number of occurrences handled. Spaces inside a match are replaced along
    with it; spaces before and after a match are preserved.

    Args:
        original_document_text: The document text to mask.
        entity_mapping: Maps entity text to its masked replacement. A dict
            value contributes its first value, None becomes "", and any
            other non-string is converted with str().
        mask_char: Accepted for interface compatibility; not used.

    Returns:
        The masked document text.
    """
    masked_document = original_document_text

    # Longest entities first so a short entity cannot break up a longer one.
    for entity_text in sorted(entity_mapping, key=len, reverse=True):
        masked_text = entity_mapping[entity_text]
        # Tolerate non-string mapping values instead of failing on concatenation.
        if isinstance(masked_text, dict):
            masked_text = next(iter(masked_text.values()), "")
        if not isinstance(masked_text, str):
            masked_text = str(masked_text) if masked_text is not None else ""

        clean_entity = entity_text.replace(" ", "")
        if not clean_entity:
            # Nothing to match on; an empty pattern would match everywhere.
            logger.debug(f"Skipping entity '{entity_text}' because it contains no non-space characters")
            continue
        if entity_text == masked_text:
            logger.debug(f"Skipping entity '{entity_text}' as masked text is identical")
            continue

        # positions[k] = index in masked_document of character k of the
        # space-stripped document.
        positions = [idx for idx, ch in enumerate(masked_document) if ch != " "]
        clean_document = masked_document.replace(" ", "")

        pieces = []
        copied_up_to = 0
        iteration_count = 0
        hit = clean_document.find(clean_entity)
        while hit != -1:
            iteration_count += 1
            start_pos = positions[hit]
            end_pos = positions[hit + len(clean_entity) - 1] + 1
            pieces.append(masked_document[copied_up_to:start_pos])
            pieces.append(masked_text)
            copied_up_to = end_pos
            # Positions are relative to the document as it was before this
            # entity's replacements were applied.
            logger.debug(f"Masked entity '{entity_text}' -> '{masked_text}' at positions {start_pos}-{end_pos} (iteration {iteration_count})")
            hit = clean_document.find(clean_entity, hit + len(clean_entity))

        if pieces:
            pieces.append(masked_document[copied_up_to:])
            masked_document = "".join(pieces)
        logger.debug(f"No more occurrences of '{entity_text}' found in document after {iteration_count + 1} iterations")

    return masked_document
def test_character_alignment(self) -> None:
    """
    Run a fixed set of alignment scenarios and log a PASS/FAIL line for each.

    Every scenario pairs an entity string with a document string and the
    exact document slice that ``_find_entity_alignment`` is expected to
    return. Results are written to the module logger; nothing is returned.
    """
    test_cases = [
        # Entity with spaces, document without spaces
        {
            "entity_text": "李 淼",
            "document_text": "上诉人李淼因合同纠纷",
            "expected_found": "李淼",
        },
        # Entity without spaces, document with spaces
        {
            "entity_text": "邓青菁",
            "document_text": "上诉人邓 青 菁因合同纠纷",
            "expected_found": "邓 青 菁",
        },
        # Both entity and document have spaces
        {
            "entity_text": "王 欢 子",
            "document_text": "法定代表人王 欢 子,总经理",
            "expected_found": "王 欢 子",
        },
        # Neither entity nor document has spaces
        {
            "entity_text": "郭东军",
            "document_text": "法定代表人郭东军,执行董事",
            "expected_found": "郭东军",
        },
        # Long company name split by spaces in the document
        {
            "entity_text": "北京丰复久信营销科技有限公司",
            "document_text": "上诉人(原审原告):北京 丰复久信 营销科技 有限公司",
            "expected_found": "北京 丰复久信 营销科技 有限公司",
        },
    ]

    logger.info("Testing character-by-character alignment...")

    for case_number, case in enumerate(test_cases, 1):
        entity_text = case["entity_text"]
        expected_found = case["expected_found"]
        alignment = self._find_entity_alignment(entity_text, case["document_text"])

        # Report a miss and move on; the remaining checks need a span.
        if not alignment:
            logger.error(f"Test {case_number} ✗ FAIL: Entity '{entity_text}' not found in document")
            continue

        start_pos, end_pos, found_text = alignment
        matches = found_text == expected_found
        status = "✓ PASS" if matches else "✗ FAIL"
        logger.info(f"Test {case_number} {status}: Entity '{entity_text}' -> Found '{found_text}' (expected '{expected_found}') at positions {start_pos}-{end_pos}")
        if not matches:
            logger.error(f" Expected: '{expected_found}', Got: '{found_text}'")

    logger.info("Character alignment testing completed.")
def extract_entities_with_ner(self, text: str) -> List[Dict[str, Any]]: def extract_entities_with_ner(self, text: str) -> List[Dict[str, Any]]:
""" """
Extract entities using the NER model Extract entities using the NER model

View File

@ -3,7 +3,7 @@ Refactored NerProcessor using the new masker architecture.
""" """
import logging import logging
from typing import Any, Dict, List, Optional from typing import Any, Dict, List, Optional, Tuple
from ..prompts.masking_prompts import ( from ..prompts.masking_prompts import (
get_ner_name_prompt, get_ner_company_prompt, get_ner_address_prompt, get_ner_name_prompt, get_ner_company_prompt, get_ner_address_prompt,
get_ner_project_prompt, get_ner_case_number_prompt, get_entity_linkage_prompt get_ner_project_prompt, get_ner_case_number_prompt, get_entity_linkage_prompt
@ -28,6 +28,137 @@ class NerProcessorRefactored:
self.maskers = self._initialize_maskers() self.maskers = self._initialize_maskers()
self.surname_counter = {} # Shared counter for Chinese names self.surname_counter = {} # Shared counter for Chinese names
def _find_entity_alignment(self, entity_text: str, original_document_text: str) -> Optional[Tuple[int, int, str]]:
    """
    Find the first space-insensitive occurrence of an entity in the document.

    The entity and the document are compared with all ' ' characters
    removed. The returned span is expressed in original-document indices and
    covers the first through the last matched character, including any
    spaces between them but none before or after.

    Args:
        entity_text: The entity text to find (may contain spaces).
        original_document_text: The document text to search (may contain spaces).

    Returns:
        Tuple of (start_pos, end_pos, found_text), or None when the entity
        has no non-space characters or is not present.
    """
    clean_entity = entity_text.replace(" ", "")
    # Guard against an empty pattern, which would match at index 0 without
    # consuming any text.
    if not clean_entity:
        return None

    # Index of each non-space character in the original text, in order.
    positions = [idx for idx, ch in enumerate(original_document_text) if ch != " "]
    hit = original_document_text.replace(" ", "").find(clean_entity)
    if hit == -1:
        return None

    start_pos = positions[hit]
    end_pos = positions[hit + len(clean_entity) - 1] + 1
    return start_pos, end_pos, original_document_text[start_pos:end_pos]
def _map_char_positions_to_original(self, clean_start: int, entity_length: int, original_text: str) -> Tuple[int, int, str]:
    """
    Translate a span in the space-stripped text into original-text indices.

    Args:
        clean_start: Start index of the span in the text with ' ' removed.
        entity_length: Number of non-space characters the span covers.
        original_text: Original document text, spaces included.

    Returns:
        Tuple of (start_pos, end_pos, found_text) in the original text. The
        span begins on the first matched non-space character; interior
        spaces are included, leading spaces are not.
    """
    text_length = len(original_text)
    position = 0

    # Skip the `clean_start` non-space characters that precede the span.
    consumed = 0
    while consumed < clean_start and position < text_length:
        if original_text[position] != " ":
            consumed += 1
        position += 1

    # Do not let the span begin on whitespace that sits before the entity.
    if entity_length > 0:
        while position < text_length and original_text[position] == " ":
            position += 1
    start_pos = position

    # Cover exactly `entity_length` non-space characters.
    matched = 0
    while matched < entity_length and position < text_length:
        if original_text[position] != " ":
            matched += 1
        position += 1
    end_pos = position

    return start_pos, end_pos, original_text[start_pos:end_pos]
def apply_entity_masking_with_alignment(self, original_document_text: str, entity_mapping: Dict[str, str], mask_char: str = "*") -> str:
    """
    Mask every occurrence of each mapped entity using space-insensitive matching.

    Entities are handled longest first. Each entity gets one left-to-right
    pass over the current document in which all non-overlapping occurrences
    are replaced. Because the pass continues after each match rather than
    restarting, replacement text that contains the entity is not masked
    again and any number of occurrences is handled.

    Args:
        original_document_text: The document text to mask.
        entity_mapping: Maps entity text to its masked replacement. A dict
            value contributes its first value, None becomes "", and any
            other non-string is converted with str().
        mask_char: Accepted for interface compatibility; not used.

    Returns:
        The masked document text.
    """
    masked_document = original_document_text

    # Longest entities first so a short entity cannot break up a longer one.
    for entity_text in sorted(entity_mapping, key=len, reverse=True):
        masked_text = entity_mapping[entity_text]
        # Tolerate non-string mapping values instead of failing on concatenation.
        if isinstance(masked_text, dict):
            masked_text = next(iter(masked_text.values()), "")
        if not isinstance(masked_text, str):
            masked_text = str(masked_text) if masked_text is not None else ""

        clean_entity = entity_text.replace(" ", "")
        if not clean_entity:
            # An entity made only of spaces cannot be located meaningfully.
            logger.debug(f"Skipping entity '{entity_text}' because it contains no non-space characters")
            continue
        if entity_text == masked_text:
            logger.debug(f"Skipping entity '{entity_text}' as masked text is identical")
            continue

        # Index of each non-space character of the current document, in order.
        positions = [idx for idx, ch in enumerate(masked_document) if ch != " "]
        clean_document = masked_document.replace(" ", "")

        pieces = []
        copied_up_to = 0
        iteration_count = 0
        hit = clean_document.find(clean_entity)
        while hit != -1:
            iteration_count += 1
            start_pos = positions[hit]
            end_pos = positions[hit + len(clean_entity) - 1] + 1
            pieces.append(masked_document[copied_up_to:start_pos])
            pieces.append(masked_text)
            copied_up_to = end_pos
            # Logged positions refer to the document before this entity's
            # replacements were applied.
            logger.debug(f"Masked entity '{entity_text}' -> '{masked_text}' at positions {start_pos}-{end_pos} (iteration {iteration_count})")
            hit = clean_document.find(clean_entity, hit + len(clean_entity))

        if pieces:
            pieces.append(masked_document[copied_up_to:])
            masked_document = "".join(pieces)
        logger.debug(f"No more occurrences of '{entity_text}' found in document after {iteration_count + 1} iterations")

    return masked_document
def _initialize_maskers(self) -> Dict[str, BaseMasker]: def _initialize_maskers(self) -> Dict[str, BaseMasker]:
"""Initialize all maskers""" """Initialize all maskers"""
maskers = {} maskers = {}

View File

@ -0,0 +1,130 @@
#!/usr/bin/env python3
"""
Debug script to understand the position mapping issue after masking.
"""
def find_entity_alignment(entity_text: str, original_document_text: str):
    """
    Return the first space-insensitive match of an entity in a document.

    Both inputs are compared with ' ' removed. The result is
    ``(start_pos, end_pos, found_text)`` in original-document indices, or
    ``None`` when the entity has no non-space characters or is absent. The
    span starts on the first matched character, so spaces that precede the
    entity are not part of it.
    """
    clean_entity = entity_text.replace(" ", "")
    # An empty pattern would match at index 0 without consuming anything.
    if not clean_entity:
        return None

    # positions[k] is the original index of the k-th non-space character.
    positions = [idx for idx, ch in enumerate(original_document_text) if ch != " "]
    hit = original_document_text.replace(" ", "").find(clean_entity)
    if hit == -1:
        return None

    start_pos = positions[hit]
    end_pos = positions[hit + len(clean_entity) - 1] + 1
    return start_pos, end_pos, original_document_text[start_pos:end_pos]
def map_char_positions_to_original(clean_start: int, entity_length: int, original_text: str):
    """
    Map a span of the space-stripped text back to original-text indices.

    ``clean_start`` and ``entity_length`` count non-space characters only.
    Returns ``(start_pos, end_pos, found_text)``; the span starts on the
    first matched non-space character and includes interior spaces.
    """
    text_length = len(original_text)
    position = 0

    # Walk past the non-space characters that precede the span.
    consumed = 0
    while consumed < clean_start and position < text_length:
        if original_text[position] != " ":
            consumed += 1
        position += 1

    # Step over spaces in front of the entity so they are not replaced with it.
    if entity_length > 0:
        while position < text_length and original_text[position] == " ":
            position += 1
    start_pos = position

    matched = 0
    while matched < entity_length and position < text_length:
        if original_text[position] != " ":
            matched += 1
        position += 1
    end_pos = position

    return start_pos, end_pos, original_text[start_pos:end_pos]
def debug_position_issue():
    """
    Print a step-by-step trace of masking one entity three times.

    After each replacement the entity is searched for again in the updated
    document, showing how match positions move as the document changes.
    """
    print("Debugging Position Mapping Issue")
    print("=" * 50)

    # Sample document containing the same name three times
    original_doc = "上诉人李淼因合同纠纷,法定代表人李淼,委托代理人李淼。"
    entity = "李淼"
    masked_text = "李M"

    print(f"Original document: '{original_doc}'")
    print(f"Entity to mask: '{entity}'")
    print(f"Masked text: '{masked_text}'")
    print()

    print("=== First Occurrence ===")
    first_hit = find_entity_alignment(entity, original_doc)
    if not first_hit:
        print("No first occurrence found")
        return

    start1, end1, found1 = first_hit
    print(f"Found at positions {start1}-{end1}: '{found1}'")

    masked_doc = original_doc[:start1] + masked_text + original_doc[end1:]
    print(f"After first mask: '{masked_doc}'")
    print(f"Length changed from {len(original_doc)} to {len(masked_doc)}")

    print("\n=== Second Occurrence (in masked document) ===")
    second_hit = find_entity_alignment(entity, masked_doc)
    if not second_hit:
        print("No second occurrence found")
        return

    start2, end2, found2 = second_hit
    print(f"Found at positions {start2}-{end2}: '{found2}'")

    masked_doc2 = masked_doc[:start2] + masked_text + masked_doc[end2:]
    print(f"After second mask: '{masked_doc2}'")

    print("\n=== Third Occurrence (in double-masked document) ===")
    third_hit = find_entity_alignment(entity, masked_doc2)
    if not third_hit:
        print("No third occurrence found")
        return

    start3, end3, found3 = third_hit
    print(f"Found at positions {start3}-{end3}: '{found3}'")
def debug_infinite_loop():
    """
    Print a bounded demonstration of why an identity mapping never terminates.

    The entity is replaced with identical text, so each search finds the same
    span again. The demonstration stops after three rounds.
    """
    rule = "=" * 50
    print("\n" + rule)
    print("Debugging Infinite Loop Issue")
    print(rule)

    # Entity and replacement are deliberately the same string
    original_doc = "上诉人李淼因合同纠纷,法定代表人李淼。北京丰复久信营销科技有限公司,丰复久信公司。"
    entity = "丰复久信公司"
    masked_text = "丰复久信公司"

    print(f"Original document: '{original_doc}'")
    print(f"Entity to mask: '{entity}'")
    print(f"Masked text: '{masked_text}' (same as original)")
    print()

    print("=== This will cause infinite loop ===")
    print("Because we're replacing '丰复久信公司' with '丰复久信公司'")
    print("The document doesn't change, so we keep finding the same position")

    masked_doc = original_doc
    for attempt in range(1, 4):  # three rounds are enough to show the repeat
        hit = find_entity_alignment(entity, masked_doc)
        if not hit:
            print(f"Iteration {attempt}: No occurrence found")
            break

        start, end, found = hit
        print(f"Iteration {attempt}: Found at positions {start}-{end}: '{found}'")

        # The "mask" is the same text, so the document is unchanged
        masked_doc = masked_doc[:start] + masked_text + masked_doc[end:]
        print(f"After mask: '{masked_doc}'")
if __name__ == "__main__":
debug_position_issue()
debug_infinite_loop()

View File

@ -0,0 +1,67 @@
#!/usr/bin/env python3
"""
Test script for character-by-character alignment functionality.
This script demonstrates how the alignment handles different spacing patterns
between entity text and original document text.
"""
import sys
import os
sys.path.append(os.path.join(os.path.dirname(__file__), 'backend'))
from app.core.document_handlers.ner_processor import NerProcessor
def main():
    """Exercise alignment and alignment-based masking and print the results."""
    rule = "=" * 50
    processor = NerProcessor()

    print("Testing Character-by-Character Alignment")
    print(rule)

    # Built-in alignment scenarios
    processor.test_character_alignment()

    print("\n" + rule)
    print("Testing Entity Masking with Alignment")
    print(rule)

    # Sample entity mapping shared by both masking runs below
    entity_mapping = {
        "北京丰复久信营销科技有限公司": "北京JO营销科技有限公司",
        "郭东军": "郭DJ",
        "周大海": "周DH",
        "北京市康达律师事务所": "北京市KD律师事务所"
    }

    # Document without extra spaces
    original_document = "上诉人原审原告北京丰复久信营销科技有限公司住所地北京市海淀区北小马厂6号1号楼华天大厦1306室。法定代表人郭东军执行董事、经理。委托诉讼代理人周大海北京市康达律师事务所律师。"

    print(f"Original document: {original_document}")
    print(f"Entity mapping: {entity_mapping}")

    masked_document = processor.apply_entity_masking_with_alignment(original_document, entity_mapping)
    print(f"Masked document: {masked_document}")

    print("\n" + rule)
    print("Testing with Document Containing Spaces")
    print(rule)

    # Same entities, but the document splits them with spaces
    spaced_document = "上诉人(原审原告):北京 丰复久信 营销科技 有限公司住所地北京市海淀区北小马厂6号1号楼华天大厦1306室。法定代表人郭 东 军,执行董事、经理。"
    print(f"Spaced document: {spaced_document}")

    masked_spaced_document = processor.apply_entity_masking_with_alignment(spaced_document, entity_mapping)
    print(f"Masked spaced document: {masked_spaced_document}")
if __name__ == "__main__":
main()

View File

@ -0,0 +1,186 @@
#!/usr/bin/env python3
"""
Final test to verify the fix handles multiple occurrences and prevents infinite loops.
"""
def find_entity_alignment(entity_text: str, original_document_text: str):
    """
    Locate an entity in a document while ignoring ' ' in both strings.

    Returns ``(start_pos, end_pos, found_text)`` for the first match, with
    indices into the original document, or ``None`` if the entity is empty
    after removing spaces or does not occur. Spaces inside the match are
    included in the span; spaces before it are not.
    """
    clean_entity = entity_text.replace(" ", "")
    # Refuse the empty pattern: it would match without consuming any text.
    if not clean_entity:
        return None

    # Original index of every non-space character, in document order.
    positions = [idx for idx, ch in enumerate(original_document_text) if ch != " "]
    hit = original_document_text.replace(" ", "").find(clean_entity)
    if hit == -1:
        return None

    start_pos = positions[hit]
    end_pos = positions[hit + len(clean_entity) - 1] + 1
    return start_pos, end_pos, original_document_text[start_pos:end_pos]
def map_char_positions_to_original(clean_start: int, entity_length: int, original_text: str):
    """
    Convert a (start, length) span counted in non-space characters into
    ``(start_pos, end_pos, found_text)`` on the original text.

    The returned span begins on the first matched non-space character and
    ends just past the last one; spaces between them are included.
    """
    text_length = len(original_text)
    position = 0

    # Skip the non-space characters that come before the span.
    consumed = 0
    while consumed < clean_start and position < text_length:
        if original_text[position] != " ":
            consumed += 1
        position += 1

    # Leading spaces belong to the surrounding text, not to the entity.
    if entity_length > 0:
        while position < text_length and original_text[position] == " ":
            position += 1
    start_pos = position

    matched = 0
    while matched < entity_length and position < text_length:
        if original_text[position] != " ":
            matched += 1
        position += 1
    end_pos = position

    return start_pos, end_pos, original_text[start_pos:end_pos]
def apply_entity_masking_with_alignment_fixed(original_document_text: str, entity_mapping: dict):
    """
    Mask every occurrence of each mapped entity, matching space-insensitively.

    Entities are processed longest first. Each entity is handled in a single
    left-to-right pass that resumes after every match, so the pass always
    terminates, replacement text is never re-masked, and there is no limit
    on the number of occurrences. Progress is printed for each replacement.

    Returns the masked document text.
    """
    masked_document = original_document_text

    for entity_text in sorted(entity_mapping, key=len, reverse=True):
        masked_text = entity_mapping[entity_text]

        clean_entity = entity_text.replace(" ", "")
        if not clean_entity:
            # Nothing to match on; an empty pattern would match everywhere.
            print(f"Skipping entity '{entity_text}' because it contains no non-space characters")
            continue
        if entity_text == masked_text:
            print(f"Skipping entity '{entity_text}' as masked text is identical")
            continue

        # positions[k] = index in masked_document of character k of the
        # space-stripped document.
        positions = [idx for idx, ch in enumerate(masked_document) if ch != " "]
        clean_document = masked_document.replace(" ", "")

        pieces = []
        copied_up_to = 0
        iteration_count = 0
        hit = clean_document.find(clean_entity)
        while hit != -1:
            iteration_count += 1
            start_pos = positions[hit]
            end_pos = positions[hit + len(clean_entity) - 1] + 1
            pieces.append(masked_document[copied_up_to:start_pos])
            pieces.append(masked_text)
            copied_up_to = end_pos
            print(f"Masked entity '{entity_text}' -> '{masked_text}' at positions {start_pos}-{end_pos} (iteration {iteration_count})")
            hit = clean_document.find(clean_entity, hit + len(clean_entity))

        if pieces:
            pieces.append(masked_document[copied_up_to:])
            masked_document = "".join(pieces)
        print(f"No more occurrences of '{entity_text}' found in document after {iteration_count + 1} iterations")

    return masked_document
def test_final_fix():
    """
    Run four masking scenarios and print a PASS/FAIL verdict for each.

    Scenarios: repeated entity, identity mapping (must be skipped), mixed
    spacing, and a multi-line document with several entities. Nothing is
    returned; all results are printed.
    """
    print("Testing Final Fix for Multiple Occurrences and Infinite Loop Prevention")
    print("=" * 70)

    # Test case 1: Multiple occurrences of the same entity (should work)
    print("\nTest Case 1: Multiple occurrences of same entity")
    test_document_1 = "上诉人李淼因合同纠纷,法定代表人李淼,委托代理人李淼。"
    entity_mapping_1 = {"李淼": "李M"}

    print(f"Original: {test_document_1}")
    result_1 = apply_entity_masking_with_alignment_fixed(test_document_1, entity_mapping_1)
    print(f"Result: {result_1}")

    remaining_1 = result_1.count("李淼")
    # Masking only touches the entity, so the commas of the input document
    # must also appear in the expected output.
    expected_1 = "上诉人李M因合同纠纷,法定代表人李M,委托代理人李M。"

    if result_1 == expected_1 and remaining_1 == 0:
        print("✅ PASS: All occurrences masked correctly")
    else:
        print(f"❌ FAIL: Expected '{expected_1}', got '{result_1}'")
        print(f" Remaining '李淼' occurrences: {remaining_1}")

    # Test case 2: Entity with same masked text (should skip to prevent infinite loop)
    print("\nTest Case 2: Entity with same masked text (should skip)")
    test_document_2 = "上诉人李淼因合同纠纷,法定代表人李淼。北京丰复久信营销科技有限公司,丰复久信公司。"
    entity_mapping_2 = {
        "李淼": "李M",
        "丰复久信公司": "丰复久信公司"  # Same text - should be skipped
    }

    print(f"Original: {test_document_2}")
    result_2 = apply_entity_masking_with_alignment_fixed(test_document_2, entity_mapping_2)
    print(f"Result: {result_2}")

    remaining_2_li = result_2.count("李淼")
    remaining_2_company = result_2.count("丰复久信公司")

    if remaining_2_li == 0 and remaining_2_company == 1:  # Company should remain unmasked
        print("✅ PASS: Infinite loop prevented, only different text masked")
    else:
        print(f"❌ FAIL: Remaining '李淼': {remaining_2_li}, '丰复久信公司': {remaining_2_company}")

    # Test case 3: Mixed spacing scenarios
    print("\nTest Case 3: Mixed spacing scenarios")
    test_document_3 = "上诉人李 淼因合同纠纷,法定代表人李淼,委托代理人李 淼。"
    entity_mapping_3 = {"李 淼": "李M", "李淼": "李M"}

    print(f"Original: {test_document_3}")
    result_3 = apply_entity_masking_with_alignment_fixed(test_document_3, entity_mapping_3)
    print(f"Result: {result_3}")

    remaining_3 = result_3.count("李淼") + result_3.count("李 淼")

    if remaining_3 == 0:
        print("✅ PASS: Mixed spacing handled correctly")
    else:
        print(f"❌ FAIL: Remaining occurrences: {remaining_3}")

    # Test case 4: Complex document with real examples
    print("\nTest Case 4: Complex document with real examples")
    test_document_4 = """上诉人原审原告北京丰复久信营销科技有限公司住所地北京市海淀区北小马厂6号1号楼华天大厦1306室。
法定代表人郭东军执行董事经理
委托诉讼代理人周大海北京市康达律师事务所律师
委托诉讼代理人王乃哲北京市康达律师事务所律师
被上诉人原审被告中研智创区块链技术有限公司住所地天津市津南区双港镇工业园区优谷产业园5号楼-1505
法定代表人王欢子总经理
委托诉讼代理人魏鑫北京市昊衡律师事务所律师"""

    entity_mapping_4 = {
        "北京丰复久信营销科技有限公司": "北京JO营销科技有限公司",
        "郭东军": "郭DJ",
        "周大海": "周DH",
        "王乃哲": "王NZ",
        "中研智创区块链技术有限公司": "中研智创区块链技术有限公司",  # Same text - should be skipped
        "王欢子": "王HZ",
        "魏鑫": "魏X",
        "北京市康达律师事务所": "北京市KD律师事务所",
        "北京市昊衡律师事务所": "北京市HH律师事务所"
    }

    print(f"Original length: {len(test_document_4)} characters")
    result_4 = apply_entity_masking_with_alignment_fixed(test_document_4, entity_mapping_4)
    print(f"Result length: {len(result_4)} characters")

    # An entity counts as unmasked only if its mapping actually changes it.
    unmasked_entities = [
        entity
        for entity, replacement in entity_mapping_4.items()
        if entity in result_4 and entity != replacement
    ]

    if not unmasked_entities:
        print("✅ PASS: All entities masked correctly in complex document")
    else:
        print(f"❌ FAIL: Unmasked entities: {unmasked_entities}")

    print("\n" + "=" * 70)
    print("Final Fix Verification Completed!")
if __name__ == "__main__":
test_final_fix()

View File

@ -0,0 +1,173 @@
#!/usr/bin/env python3
"""
Test to verify the fix for multiple occurrence issue in apply_entity_masking_with_alignment.
"""
def find_entity_alignment(entity_text: str, original_document_text: str):
    """
    Find the first occurrence of an entity in a document, ignoring ' '.

    The comparison is made on both strings with spaces removed. On success
    the result is ``(start_pos, end_pos, found_text)`` in original-document
    indices, starting on the first matched character. ``None`` is returned
    when the entity is empty after removing spaces or is not found.
    """
    clean_entity = entity_text.replace(" ", "")
    # The empty pattern matches at index 0 without consuming text; reject it.
    if not clean_entity:
        return None

    # Original index of each non-space character, in order.
    positions = [idx for idx, ch in enumerate(original_document_text) if ch != " "]
    hit = original_document_text.replace(" ", "").find(clean_entity)
    if hit == -1:
        return None

    start_pos = positions[hit]
    end_pos = positions[hit + len(clean_entity) - 1] + 1
    return start_pos, end_pos, original_document_text[start_pos:end_pos]
def map_char_positions_to_original(clean_start: int, entity_length: int, original_text: str):
    """
    Map a span measured in non-space characters onto the original text.

    Returns ``(start_pos, end_pos, found_text)``. The span starts on the
    first matched non-space character; spaces inside the span are included,
    spaces in front of it are not.
    """
    text_length = len(original_text)
    position = 0

    # Advance past the non-space characters that precede the span.
    consumed = 0
    while consumed < clean_start and position < text_length:
        if original_text[position] != " ":
            consumed += 1
        position += 1

    # Without this step the span would absorb spaces preceding the entity.
    if entity_length > 0:
        while position < text_length and original_text[position] == " ":
            position += 1
    start_pos = position

    matched = 0
    while matched < entity_length and position < text_length:
        if original_text[position] != " ":
            matched += 1
        position += 1
    end_pos = position

    return start_pos, end_pos, original_text[start_pos:end_pos]
def apply_entity_masking_with_alignment_fixed(original_document_text: str, entity_mapping: dict):
    """
    Mask every occurrence of each mapped entity, matching space-insensitively.

    Entities are processed longest first. Each entity is handled in one
    left-to-right pass that continues after every match instead of searching
    the whole document again, so the pass terminates even when the
    replacement text still matches the entity (for example an identity
    mapping). Progress is printed for each replacement.

    Returns the masked document text.
    """
    masked_document = original_document_text

    for entity_text in sorted(entity_mapping, key=len, reverse=True):
        masked_text = entity_mapping[entity_text]

        clean_entity = entity_text.replace(" ", "")
        if not clean_entity:
            # An empty pattern would match everywhere; there is nothing to mask.
            print(f"No more occurrences of '{entity_text}' found in document")
            continue

        # positions[k] = index in masked_document of character k of the
        # space-stripped document.
        positions = [idx for idx, ch in enumerate(masked_document) if ch != " "]
        clean_document = masked_document.replace(" ", "")

        pieces = []
        copied_up_to = 0
        hit = clean_document.find(clean_entity)
        while hit != -1:
            start_pos = positions[hit]
            end_pos = positions[hit + len(clean_entity) - 1] + 1
            pieces.append(masked_document[copied_up_to:start_pos])
            pieces.append(masked_text)
            copied_up_to = end_pos
            print(f"Masked entity '{entity_text}' -> '{masked_text}' at positions {start_pos}-{end_pos}")
            hit = clean_document.find(clean_entity, hit + len(clean_entity))

        if pieces:
            pieces.append(masked_document[copied_up_to:])
            masked_document = "".join(pieces)
        print(f"No more occurrences of '{entity_text}' found in document")

    return masked_document
def test_fix_verification():
    """Run four masking scenarios and print a PASS/FAIL verdict for each.

    The scenarios cover: repeated occurrences of one entity, several entities
    at once, entities written with and without inner spaces, and a longer
    multi-line document.
    """
    print("Testing Fix for Multiple Occurrence Issue")
    print("=" * 60)

    # Test case 1: Multiple occurrences of the same entity
    print("\nTest Case 1: Multiple occurrences of same entity")
    test_document_1 = "上诉人李淼因合同纠纷,法定代表人李淼,委托代理人李淼。"
    entity_mapping_1 = {"李淼": "李M"}
    print(f"Original: {test_document_1}")
    result_1 = apply_entity_masking_with_alignment_fixed(test_document_1, entity_mapping_1)
    print(f"Result: {result_1}")
    remaining_1 = result_1.count("李淼")
    # Masking only swaps the names; the punctuation of the input must survive.
    expected_1 = "上诉人李M因合同纠纷,法定代表人李M,委托代理人李M。"
    if result_1 == expected_1 and remaining_1 == 0:
        print("✅ PASS: All occurrences masked correctly")
    else:
        print(f"❌ FAIL: Expected '{expected_1}', got '{result_1}'")
        print(f" Remaining '李淼' occurrences: {remaining_1}")

    # Test case 2: Multiple entities with multiple occurrences
    print("\nTest Case 2: Multiple entities with multiple occurrences")
    test_document_2 = "上诉人李淼因合同纠纷,法定代表人李淼。北京丰复久信营销科技有限公司,丰复久信公司。"
    entity_mapping_2 = {
        "李淼": "李M",
        "北京丰复久信营销科技有限公司": "北京JO营销科技有限公司",
        "丰复久信公司": "丰复久信公司"
    }
    print(f"Original: {test_document_2}")
    result_2 = apply_entity_masking_with_alignment_fixed(test_document_2, entity_mapping_2)
    print(f"Result: {result_2}")
    remaining_2_li = result_2.count("李淼")
    remaining_2_company = result_2.count("北京丰复久信营销科技有限公司")
    if remaining_2_li == 0 and remaining_2_company == 0:
        print("✅ PASS: All entities masked correctly")
    else:
        print(f"❌ FAIL: Remaining '李淼': {remaining_2_li}, '北京丰复久信营销科技有限公司': {remaining_2_company}")

    # Test case 3: Mixed spacing scenarios
    print("\nTest Case 3: Mixed spacing scenarios")
    test_document_3 = "上诉人李 淼因合同纠纷,法定代表人李淼,委托代理人李 淼。"
    entity_mapping_3 = {"李 淼": "李M", "李淼": "李M"}
    print(f"Original: {test_document_3}")
    result_3 = apply_entity_masking_with_alignment_fixed(test_document_3, entity_mapping_3)
    print(f"Result: {result_3}")
    remaining_3 = result_3.count("李淼") + result_3.count("李 淼")
    if remaining_3 == 0:
        print("✅ PASS: Mixed spacing handled correctly")
    else:
        print(f"❌ FAIL: Remaining occurrences: {remaining_3}")

    # Test case 4: Complex document with real examples
    print("\nTest Case 4: Complex document with real examples")
    test_document_4 = """上诉人原审原告北京丰复久信营销科技有限公司住所地北京市海淀区北小马厂6号1号楼华天大厦1306室。
法定代表人郭东军执行董事经理
委托诉讼代理人周大海北京市康达律师事务所律师
委托诉讼代理人王乃哲北京市康达律师事务所律师
被上诉人原审被告中研智创区块链技术有限公司住所地天津市津南区双港镇工业园区优谷产业园5号楼-1505
法定代表人王欢子总经理
委托诉讼代理人魏鑫北京市昊衡律师事务所律师"""
    entity_mapping_4 = {
        "北京丰复久信营销科技有限公司": "北京JO营销科技有限公司",
        "郭东军": "郭DJ",
        "周大海": "周DH",
        "王乃哲": "王NZ",
        "中研智创区块链技术有限公司": "中研智创区块链技术有限公司",
        "王欢子": "王HZ",
        "魏鑫": "魏X",
        "北京市康达律师事务所": "北京市KD律师事务所",
        "北京市昊衡律师事务所": "北京市HH律师事务所"
    }
    print(f"Original length: {len(test_document_4)} characters")
    result_4 = apply_entity_masking_with_alignment_fixed(test_document_4, entity_mapping_4)
    print(f"Result length: {len(result_4)} characters")

    # Check that all entities were masked. An entity whose mask still contains
    # the entity (e.g. an identity mapping) legitimately remains in the output,
    # so it must not be counted as unmasked.
    unmasked_entities = [
        entity
        for entity, masked in entity_mapping_4.items()
        if entity not in masked and entity in result_4
    ]
    if not unmasked_entities:
        print("✅ PASS: All entities masked in complex document")
    else:
        print(f"❌ FAIL: Unmasked entities: {unmasked_entities}")

    print("\n" + "=" * 60)
    print("Fix Verification Completed!")
# Script entry point: run the fix-verification scenarios when executed directly.
if __name__ == "__main__":
    test_fix_verification()

View File

@ -0,0 +1,96 @@
#!/usr/bin/env python3
"""
Test to verify the multiple occurrence issue in apply_entity_masking_with_alignment.
"""
def find_entity_alignment(entity_text: str, original_document_text: str):
    """Find the first occurrence of an entity in a document, ignoring spaces.

    Simplified version of the alignment method for testing. Both the entity
    and the document are compared with their space characters (``' '``)
    removed; the hit is then mapped back onto the original document.

    Args:
        entity_text: Entity to look for; may contain spaces.
        original_document_text: Document to search; may contain spaces.

    Returns:
        The ``(start_pos, end_pos, found_text)`` tuple produced by
        ``map_char_positions_to_original`` for the first match, or ``None``
        when the entity does not occur or has no non-space characters.
    """
    clean_entity = entity_text.replace(" ", "")
    if not clean_entity:
        # An empty needle "matches" at offset 0 with a zero-width span,
        # which is never a meaningful entity hit.
        return None

    clean_document = original_document_text.replace(" ", "")
    clean_start = clean_document.find(clean_entity)
    if clean_start == -1:
        return None

    return map_char_positions_to_original(
        clean_start, len(clean_entity), original_document_text
    )
def map_char_positions_to_original(clean_start: int, entity_length: int, original_text: str):
    """Translate a match in the space-stripped text back to the original text.

    Simplified version of position mapping for testing.

    Args:
        clean_start: Index of the match's first character, counted over the
            non-space characters of ``original_text`` only.
        entity_length: Number of non-space characters the match covers.
        original_text: The text that still contains its spaces.

    Returns:
        A ``(start_pos, end_pos, found_text)`` tuple where
        ``found_text == original_text[start_pos:end_pos]``. The span begins
        on the entity's first non-space character and ends right after its
        last one; spaces inside the entity are part of the span.
    """
    text_length = len(original_text)
    original_pos = 0
    clean_pos = 0

    # Walk past the first `clean_start` non-space characters.
    while clean_pos < clean_start and original_pos < text_length:
        if original_text[original_pos] != ' ':
            clean_pos += 1
        original_pos += 1

    # Spaces sitting between the previous character and the entity are not
    # part of the entity; without this step the span would swallow them and
    # masking would delete the separator in front of the entity.
    while original_pos < text_length and original_text[original_pos] == ' ':
        original_pos += 1
    start_pos = original_pos

    # Consume `entity_length` non-space characters, keeping interior spaces.
    chars_found = 0
    while chars_found < entity_length and original_pos < text_length:
        if original_text[original_pos] != ' ':
            chars_found += 1
        original_pos += 1
    end_pos = original_pos

    found_text = original_text[start_pos:end_pos]
    return start_pos, end_pos, found_text
def apply_entity_masking_with_alignment_current(original_document_text: str, entity_mapping: dict):
    """Reproduce the buggy masking routine: one replacement per entity.

    Entities are handled longest first. Each entity is looked up a single
    time with ``find_entity_alignment``, so at most its first occurrence is
    replaced; later occurrences stay in the text. This is the behaviour the
    accompanying test demonstrates.

    Args:
        original_document_text: Text to mask.
        entity_mapping: Maps original entity text to its masked replacement.

    Returns:
        The document after at most one replacement per entity.
    """
    masked_document = original_document_text

    for entity_text in sorted(entity_mapping, key=len, reverse=True):
        masked_text = entity_mapping[entity_text]

        # Single lookup against the document as masked so far.
        hit = find_entity_alignment(entity_text, masked_document)
        if not hit:
            print(f"Could not find entity '{entity_text}' in document for masking")
            continue

        start_pos, end_pos, _ = hit
        prefix = masked_document[:start_pos]
        suffix = masked_document[end_pos:]
        # Splice the mask in place of the matched span.
        masked_document = prefix + masked_text + suffix
        print(f"Masked entity '{entity_text}' -> '{masked_text}' at positions {start_pos}-{end_pos}")

    return masked_document
def test_multiple_occurrences():
    """Show that the current masking routine leaves repeated entities unmasked.

    Masks a sentence containing the same name three times, then counts how
    many copies of the name remain and prints the verdict.
    """
    banner = "=" * 50
    print("Testing Multiple Occurrence Issue")
    print(banner)

    # One entity, three occurrences in the sentence.
    entity_mapping = {"李淼": "李M"}
    test_document = "上诉人李淼因合同纠纷,法定代表人李淼,委托代理人李淼。"

    print(f"Original document: {test_document}")
    print(f"Entity mapping: {entity_mapping}")
    print("Expected: All 3 occurrences of '李淼' should be masked")

    # Run the implementation under test.
    result = apply_entity_masking_with_alignment_current(test_document, entity_mapping)
    print(f"Current result: {result}")

    # Any surviving copy of the name confirms the bug.
    remaining_count = result.count("李淼")
    print(f"Remaining '李淼' occurrences: {remaining_count}")

    verdict = (
        "❌ ISSUE CONFIRMED: Multiple occurrences are not being masked!"
        if remaining_count > 0
        else "✅ No issue found (unexpected)"
    )
    print(verdict)
# Script entry point: run the multiple-occurrence reproduction when executed directly.
if __name__ == "__main__":
    test_multiple_occurrences()