#!/usr/bin/env python3
"""
Demo script for the character-by-character alignment feature.

It exercises NerProcessor on sample text whose spacing may differ between
the entity strings and the document they are looked up in.
"""

import sys
import os

# Make the sibling "backend" directory importable before loading the processor.
sys.path.append(os.path.join(os.path.dirname(__file__), "backend"))

from app.core.document_handlers.ner_processor import NerProcessor


def main():
    """Run the alignment self-test, then mask two sample documents.

    The first document is written without extra spaces; the second has
    spaces inserted inside some of the entity names. Both are passed to
    ``apply_entity_masking_with_alignment`` together with the same entity
    mapping, and every input and result is printed to stdout.
    """
    ner = NerProcessor()
    rule = "=" * 50  # horizontal separator reused by every section header

    print("Testing Character-by-Character Alignment")
    print(rule)

    # Delegate to the processor's own alignment test routine.
    ner.test_character_alignment()

    print("\n" + rule)
    print("Testing Entity Masking with Alignment")
    print(rule)

    # Sample document containing each entity without extra whitespace.
    original_document = "上诉人(原审原告):北京丰复久信营销科技有限公司,住所地北京市海淀区北小马厂6号1号楼华天大厦1306室。法定代表人:郭东军,执行董事、经理。委托诉讼代理人:周大海,北京市康达律师事务所律师。"

    # Example mapping of entity text -> masked replacement
    # (presumably in the shape produced by the NER step; verify against caller).
    entity_mapping = {
        "北京丰复久信营销科技有限公司": "北京JO营销科技有限公司",
        "郭东军": "郭DJ",
        "周大海": "周DH",
        "北京市康达律师事务所": "北京市KD律师事务所",
    }

    print(f"Original document: {original_document}")
    print(f"Entity mapping: {entity_mapping}")

    masked_document = ner.apply_entity_masking_with_alignment(
        original_document,
        entity_mapping,
    )
    print(f"Masked document: {masked_document}")

    # Same kind of text, but with spaces inserted inside the entity names.
    print("\n" + rule)
    print("Testing with Document Containing Spaces")
    print(rule)

    spaced_document = "上诉人(原审原告):北京 丰复久信 营销科技 有限公司,住所地北京市海淀区北小马厂6号1号楼华天大厦1306室。法定代表人:郭 东 军,执行董事、经理。"
    print(f"Spaced document: {spaced_document}")

    masked_spaced_document = ner.apply_entity_masking_with_alignment(
        spaced_document,
        entity_mapping,
    )
    print(f"Masked spaced document: {masked_spaced_document}")


if __name__ == "__main__":
    main()