#!/usr/bin/env python3
"""
Debug script to understand the position mapping issue after masking.

The helpers below locate an entity inside a document while ignoring ASCII
spaces on both sides, then map the match back to offsets in the original
(un-stripped) document. The two ``debug_*`` functions print step-by-step
traces of repeated find-and-mask passes.
"""

from typing import Optional, Tuple


def find_entity_alignment(
    entity_text: str, original_document_text: str
) -> Optional[Tuple[int, int, str]]:
    """Find the first space-insensitive occurrence of an entity in a document.

    ASCII spaces are removed from both the entity and the document before
    searching; the match is then mapped back to the original document.

    Args:
        entity_text: Entity to look for; spaces inside it are ignored.
        original_document_text: Document to search in.

    Returns:
        ``(start, end, found_text)`` where ``original_document_text[start:end]``
        equals ``found_text``, or ``None`` when the entity does not occur or
        contains no non-space characters.
    """
    clean_entity = entity_text.replace(" ", "")
    if not clean_entity:
        # An empty needle would "match" at offset 0 with a zero-length span,
        # which is meaningless for masking.
        return None

    clean_document = original_document_text.replace(" ", "")
    clean_start = clean_document.find(clean_entity)
    if clean_start == -1:
        return None

    return map_char_positions_to_original(
        clean_start, len(clean_entity), original_document_text
    )


def map_char_positions_to_original(
    clean_start: int, entity_length: int, original_text: str
) -> Tuple[int, int, str]:
    """Map a span in the space-stripped text back to the original text.

    Args:
        clean_start: Index of the span's first character in the text with
            ASCII spaces removed.
        entity_length: Number of non-space characters in the span.
        original_text: The original text, spaces included.

    Returns:
        ``(start_pos, end_pos, found_text)`` with
        ``found_text == original_text[start_pos:end_pos]``. Spaces that sit
        between the span's characters are included; spaces before the first
        character are not.
    """
    text_length = len(original_text)
    original_pos = 0
    clean_pos = 0

    # Walk past the ``clean_start`` non-space characters preceding the span.
    while clean_pos < clean_start and original_pos < text_length:
        if original_text[original_pos] != ' ':
            clean_pos += 1
        original_pos += 1

    # The walk above stops right after the last preceding non-space
    # character, so skip any spaces in front of the span's first character;
    # otherwise they would be reported (and masked) as part of the entity.
    while original_pos < text_length and original_text[original_pos] == ' ':
        original_pos += 1

    start_pos = original_pos

    # Consume ``entity_length`` non-space characters, stepping over any
    # spaces interleaved with them.
    chars_found = 0
    while chars_found < entity_length and original_pos < text_length:
        if original_text[original_pos] != ' ':
            chars_found += 1
        original_pos += 1

    end_pos = original_pos
    found_text = original_text[start_pos:end_pos]
    return start_pos, end_pos, found_text


def debug_position_issue():
    """Print a trace of masking the same entity three times in a row.

    Each pass searches the document produced by the previous pass, which
    shows how positions behave once earlier occurrences have been replaced.
    """
    print("Debugging Position Mapping Issue")
    print("=" * 50)

    # Test document
    original_doc = "上诉人李淼因合同纠纷,法定代表人李淼,委托代理人李淼。"
    entity = "李淼"
    masked_text = "李M"

    print(f"Original document: '{original_doc}'")
    print(f"Entity to mask: '{entity}'")
    print(f"Masked text: '{masked_text}'")
    print()

    # First occurrence
    print("=== First Occurrence ===")
    result1 = find_entity_alignment(entity, original_doc)
    if result1 is None:
        print("No first occurrence found")
        return

    start1, end1, found1 = result1
    print(f"Found at positions {start1}-{end1}: '{found1}'")

    # Apply first mask
    masked_doc = original_doc[:start1] + masked_text + original_doc[end1:]
    print(f"After first mask: '{masked_doc}'")
    print(f"Length changed from {len(original_doc)} to {len(masked_doc)}")

    # Try to find second occurrence in the masked document
    print("\n=== Second Occurrence (in masked document) ===")
    result2 = find_entity_alignment(entity, masked_doc)
    if result2 is None:
        print("No second occurrence found")
        return

    start2, end2, found2 = result2
    print(f"Found at positions {start2}-{end2}: '{found2}'")

    # Apply second mask
    masked_doc2 = masked_doc[:start2] + masked_text + masked_doc[end2:]
    print(f"After second mask: '{masked_doc2}'")

    # Try to find third occurrence
    print("\n=== Third Occurrence (in double-masked document) ===")
    result3 = find_entity_alignment(entity, masked_doc2)
    if result3 is None:
        print("No third occurrence found")
        return

    start3, end3, found3 = result3
    print(f"Found at positions {start3}-{end3}: '{found3}'")


def debug_infinite_loop():
    """Print a trace of masking an entity with text identical to itself.

    Because the replacement equals the entity, the document never changes
    and every search returns the same position. The loop is capped at three
    iterations so the demonstration terminates.
    """
    print("\n" + "=" * 50)
    print("Debugging Infinite Loop Issue")
    print("=" * 50)

    # Test document that causes infinite loop
    original_doc = "上诉人李淼因合同纠纷,法定代表人李淼。北京丰复久信营销科技有限公司,丰复久信公司。"
    entity = "丰复久信公司"
    masked_text = "丰复久信公司"  # Same text (no change)

    print(f"Original document: '{original_doc}'")
    print(f"Entity to mask: '{entity}'")
    print(f"Masked text: '{masked_text}' (same as original)")
    print()

    # This will cause infinite loop because we're replacing with the same text
    print("=== This will cause infinite loop ===")
    print("Because we're replacing '丰复久信公司' with '丰复久信公司'")
    print("The document doesn't change, so we keep finding the same position")

    # Show what happens
    masked_doc = original_doc
    for i in range(3):  # Limit to 3 iterations for demo
        result = find_entity_alignment(entity, masked_doc)
        if result is None:
            print(f"Iteration {i+1}: No occurrence found")
            break

        start, end, found = result
        print(f"Iteration {i+1}: Found at positions {start}-{end}: '{found}'")

        # Apply mask (but it's the same text)
        masked_doc = masked_doc[:start] + masked_text + masked_doc[end:]
        print(f"After mask: '{masked_doc}'")


if __name__ == "__main__":
    debug_position_issue()
    debug_infinite_loop()