legal-doc-masker/backend/tests/test_character_alignment.py

68 lines
2.3 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Test script for character-by-character alignment functionality.
This script demonstrates how the alignment handles different spacing patterns
between entity text and original document text.
"""
import sys
import os

# Make the project's ``app`` package importable when this script is run directly.
# The original entry (<this dir>/backend) is kept for backward compatibility with
# running the script from a location that has a ``backend`` folder beside it.
sys.path.append(os.path.join(os.path.dirname(__file__), 'backend'))
# NOTE(review): this file appears to live in backend/tests/, so the entry above
# resolves to backend/tests/backend. The parent directory (presumably backend/,
# where ``app`` is expected to live -- confirm against the repo layout) is added
# as well so the import below can resolve from the file's current location.
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
)
from app.core.document_handlers.ner_processor import NerProcessor
def main():
    """Run the character-alignment demo and print every result to stdout.

    Three stages are executed in order:

    1. Call ``NerProcessor.test_character_alignment()``.
    2. Mask a sample document (no extra spaces) with
       ``apply_entity_masking_with_alignment`` using a fixed entity mapping.
    3. Mask a second sample whose entity names are broken up by spaces,
       reusing the same mapping.

    Returns:
        None. The function only prints.
    """
    rule = "=" * 50

    def section(title):
        # Every heading after the first is preceded by a blank line and
        # framed by a rule above and below.
        print("\n" + rule)
        print(title)
        print(rule)

    ner = NerProcessor()

    # The opening heading has no rule above it, so it is printed by hand.
    print("Testing Character-by-Character Alignment")
    print(rule)
    ner.test_character_alignment()

    # --- Stage 2: masking a document without extra spacing ---------------
    section("Testing Entity Masking with Alignment")
    plain_text = "上诉人原审原告北京丰复久信营销科技有限公司住所地北京市海淀区北小马厂6号1号楼华天大厦1306室。法定代表人郭东军执行董事、经理。委托诉讼代理人周大海北京市康达律师事务所律师。"
    # Example mapping of entity text -> replacement text (e.g. taken from
    # NER results).
    replacements = {
        "北京丰复久信营销科技有限公司": "北京JO营销科技有限公司",
        "郭东军": "郭DJ",
        "周大海": "周DH",
        "北京市康达律师事务所": "北京市KD律师事务所",
    }
    print(f"Original document: {plain_text}")
    print(f"Entity mapping: {replacements}")
    masked_plain = ner.apply_entity_masking_with_alignment(plain_text, replacements)
    print(f"Masked document: {masked_plain}")

    # --- Stage 3: same mapping, but the document has spaces inside names --
    section("Testing with Document Containing Spaces")
    spaced_text = "上诉人(原审原告):北京 丰复久信 营销科技 有限公司住所地北京市海淀区北小马厂6号1号楼华天大厦1306室。法定代表人郭 东 军,执行董事、经理。"
    print(f"Spaced document: {spaced_text}")
    masked_spaced = ner.apply_entity_masking_with_alignment(spaced_text, replacements)
    print(f"Masked spaced document: {masked_spaced}")
# Run the demo only when this file is executed as a script; importing the
# module does not call main().
if __name__ == "__main__":
    main()