# legal-doc-masker/backend/tests/test_ner_extractor.py

#!/usr/bin/env python3
"""
Test script for NER extractor integration
"""

import sys
import os
import logging

# Add the backend directory to the Python path so ``app`` can be imported.
# NOTE: this script is recorded as living in ``backend/tests``, in which case
# the backend root is the PARENT of this file's directory, not a ``backend``
# sub-directory of it. The parent is therefore added as well; the original
# ``<this dir>/backend`` entry is inserted last so it stays at index 0 exactly
# as before (covers copies of the script run from the repository root).
_THIS_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, os.path.dirname(_THIS_DIR))
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'backend'))

from app.core.document_handlers.extractors.ner_extractor import NERExtractor
from app.core.document_handlers.ner_processor import NerProcessor

# Configure logging: set up basic logging with INFO as the root level and
# create the module-level logger. The functions visible in this file report
# their progress with print() rather than through ``logger``.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def test_ner_extractor():
    """Run ``NERExtractor`` against a short sample of Chinese legal text.

    Prints the model information, every extracted entity with its type and
    confidence, and the per-type summary. Any exception raised along the way
    is caught and reported on stdout instead of propagating.

    Returns:
        bool: ``True`` when every step completed, ``False`` if anything raised.
    """
    print("🧪 Testing NER Extractor")
    print("=" * 50)

    # Sample legal text
    text_to_analyze = """
上诉人（原审原告）：北京丰复久信营销科技有限公司，住所地北京市海淀区北小马厂6号1号楼华天大厦1306室。
法定代表人：郭东军，执行董事、经理。
委托诉讼代理人：周大海，北京市康达律师事务所律师。
被上诉人（原审被告）：中研智创区块链技术有限公司，住所地天津市津南区双港镇工业园区优谷产业园5号楼-1505。
法定代表人：王欢子，总经理。
"""

    try:
        # Step 1: build the extractor and report which model it uses
        print("1. Testing NER Extractor...")
        extractor = NERExtractor()

        info = extractor.get_model_info()
        print(f"   Model: {info['model_name']}")
        print(f"   Supported entities: {info['supported_entities']}")

        # Step 2: run the extraction and list each entity
        outcome = extractor.extract_and_summarize(text_to_analyze)

        print(f"\n2. Extraction Results:")
        print(f"   Total entities found: {outcome['total_count']}")

        for item in outcome['entities']:
            text, kind, score = item['text'], item['type'], item['confidence']
            print(f"   - '{text}' ({kind}) - Confidence: {score:.4f}")

        # Step 3: show the entities grouped by type
        print(f"\n3. Summary:")
        grouped = outcome['summary']['summary']
        for label, members in grouped.items():
            print(f"   {label}: {len(members)} entities")
            for member in members:
                print(f"     - {member}")

    except Exception as exc:
        print(f"❌ NER Extractor test failed: {exc!s}")
        return False

    return True

def test_ner_processor():
    """Run ``NerProcessor`` against a short sample of Chinese legal text.

    First lists the entities returned by ``extract_entities_with_ner``, then
    passes the same text as a single chunk to ``process_ner_only`` and prints
    the resulting original -> masked mapping. Any exception is caught and
    reported on stdout instead of propagating.

    Returns:
        bool: ``True`` when every step completed, ``False`` if anything raised.
    """
    print("\n🧪 Testing NER Processor Integration")
    print("=" * 50)

    # Sample legal text
    text_to_analyze = """
上诉人（原审原告）：北京丰复久信营销科技有限公司，住所地北京市海淀区北小马厂6号1号楼华天大厦1306室。
法定代表人：郭东军，执行董事、经理。
委托诉讼代理人：周大海，北京市康达律师事务所律师。
被上诉人（原审被告）：中研智创区块链技术有限公司，住所地天津市津南区双港镇工业园区优谷产业园5号楼-1505。
法定代表人：王欢子，总经理。
"""

    try:
        # Step 1: build the processor
        print("1. Testing NER Processor...")
        processor = NerProcessor()

        # Step 2: entity extraction using only the NER model
        print("2. Testing NER-only entity extraction...")
        found = processor.extract_entities_with_ner(text_to_analyze)
        print(f"   Extracted {len(found)} entities with NER model")

        for item in found:
            text, kind, score = item['text'], item['type'], item['confidence']
            print(f"   - '{text}' ({kind}) - Confidence: {score:.4f}")

        # Step 3: full NER-only processing, with the sample as a single chunk
        print("\n3. Testing NER-only document processing...")
        mapping = processor.process_ner_only([text_to_analyze])

        print(f"   Generated {len(mapping)} masking mappings")
        for source_text, replacement in mapping.items():
            print(f"   '{source_text}' -> '{replacement}'")

    except Exception as exc:
        print(f"❌ NER Processor test failed: {exc!s}")
        return False

    return True

def main():
    """Run both NER integration checks and print a pass/fail summary.

    Both checks are always executed, even if the first one fails, so the
    summary reports on each of them.

    Returns:
        int: ``0`` when both checks succeeded, ``1`` otherwise. The value is
        meant to be passed to ``sys.exit`` as the process exit status.
    """
    print("🧪 NER Integration Test Suite")
    print("=" * 60)

    # Test 1: NER Extractor
    extractor_success = test_ner_extractor()

    # Test 2: NER Processor Integration
    processor_success = test_ner_processor()

    # Summary
    print("\n" + "=" * 60)
    print("📊 Test Summary:")
    print(f"   NER Extractor: {'✅' if extractor_success else '❌'}")
    print(f"   NER Processor: {'✅' if processor_success else '❌'}")

    if extractor_success and processor_success:
        print("\n🎉 All tests passed! NER integration is working correctly.")
        print("\nNext steps:")
        print("1. The NER extractor is ready to use in the document processing pipeline")
        print("2. You can use process_ner_only() for ML-based entity extraction")
        print("3. The existing process() method now includes NER extraction")
        return 0

    print("\n⚠️  Some tests failed. Please check the error messages above.")
    return 1


if __name__ == "__main__":
    # Propagate failures through the exit status so shells and CI can detect
    # them; previously the script exited with 0 regardless of the outcome.
    sys.exit(main())