#!/usr/bin/env python3
"""
Test script for NER extractor integration.

Run this file directly. It prepends the ``backend`` directory that sits next
to this script to ``sys.path`` so that the ``app`` package can be imported,
then exercises ``NERExtractor`` and ``NerProcessor`` on a sample text and
prints the results.
"""

import sys
import os
import logging

# Add the backend directory to the Python path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'backend'))

# These imports must stay below the sys.path tweak above: the ``app`` package
# is resolved through the ``backend`` directory that was just inserted.
from app.core.document_handlers.extractors.ner_extractor import NERExtractor  # noqa: E402
from app.core.document_handlers.ner_processor import NerProcessor  # noqa: E402

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def test_ner_extractor():
    """Test the NER extractor directly.

    Builds an ``NERExtractor``, prints its model information, runs
    ``extract_and_summarize`` on a sample legal text and prints every
    extracted entity followed by the per-type summary.

    The result is read as a mapping with the keys ``total_count``,
    ``entities`` (items exposing ``text``, ``type`` and ``confidence``) and
    ``summary`` (whose own ``summary`` entry maps an entity type to a list of
    texts).

    Returns:
        bool: ``True`` when every step completed without raising, ``False``
        when any exception occurred (its message is printed to stdout).
    """
    print("🧪 Testing NER Extractor")
    print("=" * 50)

    # Sample legal text
    text_to_analyze = """
上诉人(原审原告):北京丰复久信营销科技有限公司,住所地北京市海淀区北小马厂6号1号楼华天大厦1306室。
法定代表人:郭东军,执行董事、经理。
委托诉讼代理人:周大海,北京市康达律师事务所律师。
被上诉人(原审被告):中研智创区块链技术有限公司,住所地天津市津南区双港镇工业园区优谷产业园5号楼-1505。
法定代表人:王欢子,总经理。
"""

    # The whole scenario is guarded on purpose: this is a diagnostic script,
    # so any failure (model loading, inference, unexpected result shape) is
    # reported as a failed check instead of aborting the remaining checks.
    try:
        # Test NER extractor
        print("1. Testing NER Extractor...")
        ner_extractor = NERExtractor()

        # Get model info
        model_info = ner_extractor.get_model_info()
        print(f" Model: {model_info['model_name']}")
        print(f" Supported entities: {model_info['supported_entities']}")

        # Extract entities
        result = ner_extractor.extract_and_summarize(text_to_analyze)

        print("\n2. Extraction Results:")
        print(f" Total entities found: {result['total_count']}")

        for entity in result['entities']:
            print(f" - '{entity['text']}' ({entity['type']}) - Confidence: {entity['confidence']:.4f}")

        print("\n3. Summary:")
        for entity_type, texts in result['summary']['summary'].items():
            print(f" {entity_type}: {len(texts)} entities")
            for text in texts:
                print(f" - {text}")

        return True

    except Exception as exc:
        print(f"❌ NER Extractor test failed: {exc}")
        return False


def test_ner_processor():
    """Test the NER processor integration.

    Builds an ``NerProcessor``, runs ``extract_entities_with_ner`` on a
    sample legal text and prints each entity (read through its ``text``,
    ``type`` and ``confidence`` entries), then passes the same text as a
    single chunk to ``process_ner_only`` and prints every
    ``original -> masked`` pair of the returned mapping.

    Returns:
        bool: ``True`` when every step completed without raising, ``False``
        when any exception occurred (its message is printed to stdout).
    """
    print("\n🧪 Testing NER Processor Integration")
    print("=" * 50)

    # Sample legal text
    text_to_analyze = """
上诉人(原审原告):北京丰复久信营销科技有限公司,住所地北京市海淀区北小马厂6号1号楼华天大厦1306室。
法定代表人:郭东军,执行董事、经理。
委托诉讼代理人:周大海,北京市康达律师事务所律师。
被上诉人(原审被告):中研智创区块链技术有限公司,住所地天津市津南区双港镇工业园区优谷产业园5号楼-1505。
法定代表人:王欢子,总经理。
"""

    # Guarded as a whole so that a failure here is reported as a failed
    # check rather than crashing the script.
    try:
        # Test NER processor
        print("1. Testing NER Processor...")
        ner_processor = NerProcessor()

        # Test NER-only extraction
        print("2. Testing NER-only entity extraction...")
        ner_entities = ner_processor.extract_entities_with_ner(text_to_analyze)
        print(f" Extracted {len(ner_entities)} entities with NER model")

        for entity in ner_entities:
            print(f" - '{entity['text']}' ({entity['type']}) - Confidence: {entity['confidence']:.4f}")

        # Test NER-only processing
        print("\n3. Testing NER-only document processing...")
        chunks = [text_to_analyze]  # Single chunk for testing
        mapping = ner_processor.process_ner_only(chunks)

        print(f" Generated {len(mapping)} masking mappings")
        for original, masked in mapping.items():
            print(f" '{original}' -> '{masked}'")

        return True

    except Exception as exc:
        print(f"❌ NER Processor test failed: {exc}")
        return False


def main():
    """Main test function.

    Runs the extractor check and the processor check in order, then prints a
    pass/fail summary for both.

    Returns:
        int: ``0`` when both checks passed, ``1`` otherwise, so the value can
        be used directly as the process exit status.
    """
    print("🧪 NER Integration Test Suite")
    print("=" * 60)

    # Test 1: NER Extractor
    extractor_success = test_ner_extractor()

    # Test 2: NER Processor Integration
    processor_success = test_ner_processor()

    # Summary
    print("\n" + "=" * 60)
    print("📊 Test Summary:")
    print(f" NER Extractor: {'✅' if extractor_success else '❌'}")
    print(f" NER Processor: {'✅' if processor_success else '❌'}")

    all_passed = bool(extractor_success and processor_success)
    if all_passed:
        print("\n🎉 All tests passed! NER integration is working correctly.")
        print("\nNext steps:")
        print("1. The NER extractor is ready to use in the document processing pipeline")
        print("2. You can use process_ner_only() for ML-based entity extraction")
        print("3. The existing process() method now includes NER extraction")
    else:
        print("\n⚠️ Some tests failed. Please check the error messages above.")

    return 0 if all_passed else 1


if __name__ == "__main__":
    # Hand the result to the shell: a failed check must yield a non-zero
    # exit status so that CI jobs and wrapper scripts can detect it.
    sys.exit(main())