131 lines
4.8 KiB
Python
131 lines
4.8 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Debug script to understand the position mapping issue after masking.
|
|
"""
|
|
|
|
def find_entity_alignment(entity_text: str, original_document_text: str):
    """Locate the first occurrence of an entity, ignoring ASCII spaces.

    Both the entity and the document are compared with every ' ' character
    removed, so an entity matches even when the two are spaced differently.

    Args:
        entity_text: Entity to look for; spaces in it are ignored.
        original_document_text: Document to search; spaces in it are ignored
            for matching purposes only.

    Returns:
        Whatever map_char_positions_to_original returns for the first match
        (a ``(start_pos, end_pos, found_text)`` tuple expressed in
        original-document coordinates), or ``None`` when the entity does not
        occur or is empty once spaces are removed.
    """
    clean_entity = entity_text.replace(" ", "")
    if not clean_entity:
        # An empty pattern would "match" at index 0 with zero length, which
        # makes any replace-and-search-again loop spin forever.
        return None

    clean_doc = original_document_text.replace(" ", "")
    clean_start = clean_doc.find(clean_entity)
    if clean_start == -1:
        return None

    # Translate the index in the space-free text back to the original text.
    return map_char_positions_to_original(
        clean_start, len(clean_entity), original_document_text
    )
|
|
|
|
def map_char_positions_to_original(clean_start: int, entity_length: int, original_text: str):
    """Map a span in the space-free text back onto the original text.

    Args:
        clean_start: Index of the span's first character, counted in the
            text with all ' ' characters removed.
        entity_length: Number of non-space characters in the span.
        original_text: The original text, spaces included.

    Returns:
        A ``(start_pos, end_pos, found_text)`` tuple where ``start_pos`` and
        ``end_pos`` are indices into ``original_text`` (end exclusive) and
        ``found_text`` is ``original_text[start_pos:end_pos]``. Spaces that
        lie between the span's characters are kept; spaces before its first
        character are not. If the text runs out early, the positions are
        clamped to ``len(original_text)``.
    """
    text_len = len(original_text)
    pos = 0

    # Step over the clean_start non-space characters that precede the span.
    skipped = 0
    while skipped < clean_start and pos < text_len:
        if original_text[pos] != ' ':
            skipped += 1
        pos += 1

    # Skip spaces sitting directly before the span so that it starts on its
    # first real character rather than on preceding whitespace.
    while pos < text_len and original_text[pos] == ' ':
        pos += 1
    start_pos = pos

    # Consume entity_length non-space characters, keeping interior spaces.
    matched = 0
    while matched < entity_length and pos < text_len:
        if original_text[pos] != ' ':
            matched += 1
        pos += 1
    end_pos = pos

    return start_pos, end_pos, original_text[start_pos:end_pos]
|
|
|
|
def debug_position_issue():
    """Print a step-by-step trace of masking the same entity up to three times.

    Each step searches the current document with find_entity_alignment,
    reports the span found, and splices the masked text in before the next
    search. The trace stops at the first search that finds nothing.
    """
    print("Debugging Position Mapping Issue")
    print("=" * 50)

    # Sample document in which the entity appears three times
    original_doc = "上诉人李淼因合同纠纷,法定代表人李淼,委托代理人李淼。"
    entity = "李淼"
    masked_text = "李M"

    print(f"Original document: '{original_doc}'")
    print(f"Entity to mask: '{entity}'")
    print(f"Masked text: '{masked_text}'")
    print()

    # --- first search: untouched document ---
    print("=== First Occurrence ===")
    first_hit = find_entity_alignment(entity, original_doc)
    if not first_hit:
        print("No first occurrence found")
        return

    first_start, first_end, first_text = first_hit
    print(f"Found at positions {first_start}-{first_end}: '{first_text}'")

    once_masked = original_doc[:first_start] + masked_text + original_doc[first_end:]
    print(f"After first mask: '{once_masked}'")
    print(f"Length changed from {len(original_doc)} to {len(once_masked)}")

    # --- second search: document with one mask applied ---
    print("\n=== Second Occurrence (in masked document) ===")
    second_hit = find_entity_alignment(entity, once_masked)
    if not second_hit:
        print("No second occurrence found")
        return

    second_start, second_end, second_text = second_hit
    print(f"Found at positions {second_start}-{second_end}: '{second_text}'")

    twice_masked = once_masked[:second_start] + masked_text + once_masked[second_end:]
    print(f"After second mask: '{twice_masked}'")

    # --- third search: document with two masks applied ---
    print("\n=== Third Occurrence (in double-masked document) ===")
    third_hit = find_entity_alignment(entity, twice_masked)
    if not third_hit:
        print("No third occurrence found")
        return

    third_start, third_end, third_text = third_hit
    print(f"Found at positions {third_start}-{third_end}: '{third_text}'")
|
|
|
|
def debug_infinite_loop():
    """Show why masking an entity with identical text never makes progress.

    The replacement equals the entity, so the document is unchanged after
    each splice and every search lands on the same span. The demo is capped
    at three iterations instead of looping forever.
    """
    banner = "=" * 50
    print("\n" + banner)
    print("Debugging Infinite Loop Issue")
    print(banner)

    # Document and entity that trigger the non-terminating behaviour
    original_doc = "上诉人李淼因合同纠纷,法定代表人李淼。北京丰复久信营销科技有限公司,丰复久信公司。"
    entity = "丰复久信公司"
    masked_text = "丰复久信公司"  # identical to the entity, so masking is a no-op

    print(f"Original document: '{original_doc}'")
    print(f"Entity to mask: '{entity}'")
    print(f"Masked text: '{masked_text}' (same as original)")
    print()

    # Explain up front what the iterations below will show
    print("=== This will cause infinite loop ===")
    print("Because we're replacing '丰复久信公司' with '丰复久信公司'")
    print("The document doesn't change, so we keep finding the same position")

    current_doc = original_doc
    for step in range(3):  # bounded on purpose: this is only a demonstration
        hit = find_entity_alignment(entity, current_doc)
        if not hit:
            print(f"Iteration {step+1}: No occurrence found")
            break

        span_start, span_end, span_text = hit
        print(f"Iteration {step+1}: Found at positions {span_start}-{span_end}: '{span_text}'")

        # Splice the "mask" in; the text is the same, so nothing changes
        current_doc = current_doc[:span_start] + masked_text + current_doc[span_end:]
        print(f"After mask: '{current_doc}'")
|
|
|
|
if __name__ == "__main__":
    # Run both demonstrations, in order, when executed as a script.
    for run_demo in (debug_position_issue, debug_infinite_loop):
        run_demo()
|