#!/usr/bin/env python3
"""
Test script for NER extractor integration

Runs two smoke checks against the project's NER components and prints a
human-readable report. The process exit status is 0 when both checks pass
and 1 otherwise, so the script can be used from CI or shell pipelines.
"""

import logging
import os
import sys

# Add the backend directory to the Python path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'backend'))

from app.core.document_handlers.extractors.ner_extractor import NERExtractor
from app.core.document_handlers.ner_processor import NerProcessor

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Sample legal text shared by both checks, so they exercise the same input.
SAMPLE_LEGAL_TEXT = """
上诉人(原审原告):北京丰复久信营销科技有限公司,住所地北京市海淀区北小马厂6号1号楼华天大厦1306室。
法定代表人:郭东军,执行董事、经理。
委托诉讼代理人:周大海,北京市康达律师事务所律师。
被上诉人(原审被告):中研智创区块链技术有限公司,住所地天津市津南区双港镇工业园区优谷产业园5号楼-1505。
法定代表人:王欢子,总经理。
"""


def test_ner_extractor():
    """Test the NER extractor directly.

    Instantiates ``NERExtractor``, prints the ``model_name`` and
    ``supported_entities`` entries of ``get_model_info()``, then runs
    ``extract_and_summarize()`` on the sample text and prints each entity
    and the per-type summary.

    Returns:
        bool: True when every step completed without raising, False
        otherwise. A bool is returned (rather than asserting) so that
        ``main()`` can aggregate the outcome of both checks.
    """
    print("🧪 Testing NER Extractor")
    print("=" * 50)

    text_to_analyze = SAMPLE_LEGAL_TEXT

    try:
        # Test NER extractor
        print("1. Testing NER Extractor...")
        ner_extractor = NERExtractor()

        # Get model info
        model_info = ner_extractor.get_model_info()
        print(f" Model: {model_info['model_name']}")
        print(f" Supported entities: {model_info['supported_entities']}")

        # Extract entities
        result = ner_extractor.extract_and_summarize(text_to_analyze)

        print("\n2. Extraction Results:")
        print(f" Total entities found: {result['total_count']}")

        for entity in result['entities']:
            print(f" - '{entity['text']}' ({entity['type']}) - Confidence: {entity['confidence']:.4f}")

        print("\n3. Summary:")
        for entity_type, texts in result['summary']['summary'].items():
            print(f" {entity_type}: {len(texts)} entities")
            for text in texts:
                print(f" - {text}")
    except Exception as e:
        # Top-level boundary of the smoke check: record the full traceback
        # for debugging, then report the failure instead of crashing.
        logger.exception("NER Extractor test failed")
        print(f"❌ NER Extractor test failed: {e}")
        return False

    return True


def test_ner_processor():
    """Test the NER processor integration.

    Instantiates ``NerProcessor``, runs ``extract_entities_with_ner()`` on
    the sample text and prints each entity, then passes the text as a
    single chunk to ``process_ner_only()`` and prints the resulting
    original -> masked mapping.

    Returns:
        bool: True when every step completed without raising, False
        otherwise.
    """
    print("\n🧪 Testing NER Processor Integration")
    print("=" * 50)

    text_to_analyze = SAMPLE_LEGAL_TEXT

    try:
        # Test NER processor
        print("1. Testing NER Processor...")
        ner_processor = NerProcessor()

        # Test NER-only extraction
        print("2. Testing NER-only entity extraction...")
        ner_entities = ner_processor.extract_entities_with_ner(text_to_analyze)
        print(f" Extracted {len(ner_entities)} entities with NER model")

        for entity in ner_entities:
            print(f" - '{entity['text']}' ({entity['type']}) - Confidence: {entity['confidence']:.4f}")

        # Test NER-only processing
        print("\n3. Testing NER-only document processing...")
        chunks = [text_to_analyze]  # Single chunk for testing
        mapping = ner_processor.process_ner_only(chunks)

        print(f" Generated {len(mapping)} masking mappings")
        for original, masked in mapping.items():
            print(f" '{original}' -> '{masked}'")
    except Exception as e:
        # Top-level boundary of the smoke check: record the full traceback
        # for debugging, then report the failure instead of crashing.
        logger.exception("NER Processor test failed")
        print(f"❌ NER Processor test failed: {e}")
        return False

    return True


def main():
    """Main test function.

    Runs both checks (the second runs even if the first fails), prints a
    summary, and reports the overall outcome.

    Returns:
        int: 0 when both checks passed, 1 otherwise; intended to be used
        as the process exit status.
    """
    print("🧪 NER Integration Test Suite")
    print("=" * 60)

    # Test 1: NER Extractor
    extractor_success = test_ner_extractor()

    # Test 2: NER Processor Integration
    processor_success = test_ner_processor()

    # Summary
    print("\n" + "=" * 60)
    print("📊 Test Summary:")
    print(f" NER Extractor: {'✅' if extractor_success else '❌'}")
    print(f" NER Processor: {'✅' if processor_success else '❌'}")

    all_passed = extractor_success and processor_success
    if all_passed:
        print("\n🎉 All tests passed! NER integration is working correctly.")
        print("\nNext steps:")
        print("1. The NER extractor is ready to use in the document processing pipeline")
        print("2. You can use process_ner_only() for ML-based entity extraction")
        print("3. The existing process() method now includes NER extraction")
    else:
        print("\n⚠️ Some tests failed. Please check the error messages above.")

    return 0 if all_passed else 1


if __name__ == "__main__":
    # Propagate the outcome so callers can detect failures via exit status.
    sys.exit(main())