legal-doc-masker/backend/tests/test_ner_extractor.py

135 lines
4.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Test script for NER extractor integration.

Exercises NERExtractor directly and through NerProcessor on a short sample
of Chinese legal text, printing the results to stdout.
"""
import sys
import os
import logging

# Make the ``app`` package importable by prepending
# "<directory of this file>/backend" to sys.path. This has to run before the
# ``app.*`` imports below, so the statement order here is significant.
# NOTE(review): this only resolves if a "backend" directory sits next to this
# file; if the script itself lives under backend/tests/, the parent directory
# is probably the one that should be added -- confirm.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'backend'))
from app.core.document_handlers.extractors.ner_extractor import NERExtractor
from app.core.document_handlers.ner_processor import NerProcessor

# Configure root logging at INFO level and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def test_ner_extractor():
    """Run the NER extractor on a sample legal text and print what it finds.

    Builds an ``NERExtractor``, prints the model information it reports, then
    prints every entity and the per-type summary returned by
    ``extract_and_summarize``.

    Returns:
        bool: ``True`` if every step completed without raising, ``False`` if
        any exception was raised (the error is printed and also logged with
        its traceback).
    """
    print("🧪 Testing NER Extractor")
    print("=" * 50)

    # Sample legal text. Assembled from explicit lines so its value does not
    # depend on how this function body happens to be indented.
    text_to_analyze = (
        "\n"
        "上诉人原审原告北京丰复久信营销科技有限公司住所地北京市海淀区北小马厂6号1号楼华天大厦1306室。\n"
        "法定代表人:郭东军,执行董事、经理。\n"
        "委托诉讼代理人:周大海,北京市康达律师事务所律师。\n"
        "被上诉人原审被告中研智创区块链技术有限公司住所地天津市津南区双港镇工业园区优谷产业园5号楼-1505。\n"
        "法定代表人:王欢子,总经理。\n"
    )

    try:
        print("1. Testing NER Extractor...")
        ner_extractor = NERExtractor()

        # Report which model is loaded and which entity types it supports.
        model_info = ner_extractor.get_model_info()
        print(f" Model: {model_info['model_name']}")
        print(f" Supported entities: {model_info['supported_entities']}")

        # Extract entities and print each one with its confidence score.
        result = ner_extractor.extract_and_summarize(text_to_analyze)
        print("\n2. Extraction Results:")
        print(f" Total entities found: {result['total_count']}")
        for entity in result['entities']:
            print(f" - '{entity['text']}' ({entity['type']}) - Confidence: {entity['confidence']:.4f}")

        # Print the extracted texts grouped by entity type.
        print("\n3. Summary:")
        for entity_type, texts in result['summary']['summary'].items():
            print(f" {entity_type}: {len(texts)} entities")
            for text in texts:
                print(f" - {text}")
    except Exception as e:
        # Deliberately broad: this is the outer boundary of the check, and the
        # caller needs a boolean so the remaining checks can still run. The
        # traceback is logged so the cause of the failure is not lost.
        print(f"❌ NER Extractor test failed: {e}")
        logger.exception("NER Extractor test failed")
        return False
    return True
def test_ner_processor():
    """Run the NER processor integration on a sample legal text.

    Builds an ``NerProcessor``, prints the entities returned by
    ``extract_entities_with_ner``, then feeds the same text as a single chunk
    to ``process_ner_only`` and prints the resulting original-to-masked
    mapping.

    Returns:
        bool: ``True`` if every step completed without raising, ``False`` if
        any exception was raised (the error is printed and also logged with
        its traceback).
    """
    print("\n🧪 Testing NER Processor Integration")
    print("=" * 50)

    # Sample legal text. Assembled from explicit lines so its value does not
    # depend on how this function body happens to be indented.
    text_to_analyze = (
        "\n"
        "上诉人原审原告北京丰复久信营销科技有限公司住所地北京市海淀区北小马厂6号1号楼华天大厦1306室。\n"
        "法定代表人:郭东军,执行董事、经理。\n"
        "委托诉讼代理人:周大海,北京市康达律师事务所律师。\n"
        "被上诉人原审被告中研智创区块链技术有限公司住所地天津市津南区双港镇工业园区优谷产业园5号楼-1505。\n"
        "法定代表人:王欢子,总经理。\n"
    )

    try:
        print("1. Testing NER Processor...")
        ner_processor = NerProcessor()

        # NER-only entity extraction: print each entity with its confidence.
        print("2. Testing NER-only entity extraction...")
        ner_entities = ner_processor.extract_entities_with_ner(text_to_analyze)
        print(f" Extracted {len(ner_entities)} entities with NER model")
        for entity in ner_entities:
            print(f" - '{entity['text']}' ({entity['type']}) - Confidence: {entity['confidence']:.4f}")

        # NER-only document processing on a single chunk.
        print("\n3. Testing NER-only document processing...")
        chunks = [text_to_analyze]
        mapping = ner_processor.process_ner_only(chunks)
        print(f" Generated {len(mapping)} masking mappings")
        for original, masked in mapping.items():
            print(f" '{original}' -> '{masked}'")
    except Exception as e:
        # Deliberately broad: this is the outer boundary of the check, and the
        # caller needs a boolean result. The traceback is logged so the cause
        # of the failure is not lost.
        print(f"❌ NER Processor test failed: {e}")
        logger.exception("NER Processor test failed")
        return False
    return True
def main():
    """Run both NER checks and print a pass/fail summary.

    Calls ``test_ner_extractor`` and then ``test_ner_processor`` (both always
    run, regardless of the first result), prints one status line per check,
    and finishes with either the next-steps hints or a failure notice.
    """
    print("🧪 NER Integration Test Suite")
    print("=" * 60)

    # Test 1: NER Extractor
    extractor_success = test_ner_extractor()
    # Test 2: NER Processor Integration
    processor_success = test_ner_processor()

    # Summary: each line carries an explicit marker so the outcome of every
    # check can be read from the summary alone.
    print("\n" + "=" * 60)
    print("📊 Test Summary:")
    print(f" NER Extractor: {'✅ PASS' if extractor_success else '❌ FAIL'}")
    print(f" NER Processor: {'✅ PASS' if processor_success else '❌ FAIL'}")

    if extractor_success and processor_success:
        print("\n🎉 All tests passed! NER integration is working correctly.")
        print("\nNext steps:")
        print("1. The NER extractor is ready to use in the document processing pipeline")
        print("2. You can use process_ner_only() for ML-based entity extraction")
        print("3. The existing process() method now includes NER extraction")
    else:
        print("\n⚠️ Some tests failed. Please check the error messages above.")
# Run the suite only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()