From 0f158c159b091844d3e5d3a037272643459cd39d Mon Sep 17 00:00:00 2001 From: oliviamn Date: Thu, 8 May 2025 00:04:50 +0800 Subject: [PATCH] Enhance PDF content masking by introducing mapping prompts - Added a new function `get_masking_mapping_prompt` to generate prompts for creating a mapping of original names/companies to their masked versions. - Updated `PdfDocumentProcessor` to utilize the new mapping prompt, processing each sentence individually for improved content masking. --- .../processors/pdf_processor.py | 17 ++++--- src/prompts/masking_prompts.py | 44 +++++++++++++++++++ 2 files changed, 55 insertions(+), 6 deletions(-) diff --git a/src/document_handlers/processors/pdf_processor.py b/src/document_handlers/processors/pdf_processor.py index e174a23..1d9d1ff 100644 --- a/src/document_handlers/processors/pdf_processor.py +++ b/src/document_handlers/processors/pdf_processor.py @@ -5,7 +5,7 @@ from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedData from magic_pdf.data.dataset import PymuDocDataset from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze from magic_pdf.config.enums import SupportedPdfParseMethod -from prompts.masking_prompts import get_masking_prompt +from prompts.masking_prompts import get_masking_prompt, get_masking_mapping_prompt import logging from services.ollama_client import OllamaClient from config.settings import settings @@ -79,11 +79,16 @@ class PdfDocumentProcessor(DocumentProcessor): pipe_result.dump_middle_json(md_writer, f'{self.name_without_suff}_middle.json') logger.info("Masking content") - formatted_prompt = get_masking_prompt(md_content) - logger.info("Calling ollama to generate response") - response = self.ollama_client.generate(formatted_prompt) - logger.info("Response generated") - return response + + sentences = md_content.split("。") + final_md = "" + for sentence in sentences: + formatted_prompt = get_masking_mapping_prompt(sentence) + logger.info("Calling ollama to generate response, prompt: %s", formatted_prompt) + response = self.ollama_client.generate(formatted_prompt) + logger.info(f"Response generated: {response}") + final_md += response + "。" + return final_md diff --git a/src/prompts/masking_prompts.py b/src/prompts/masking_prompts.py index 98a610a..69461da 100644 --- a/src/prompts/masking_prompts.py +++ b/src/prompts/masking_prompts.py @@ -30,4 +30,48 @@ def get_masking_prompt(text: str) -> str: 请直接输出脱敏后的文本,无需解释或其他备注。 """) + return prompt.format(text=text) + +def get_masking_mapping_prompt(text: str) -> str: + """ + Returns a prompt that generates a mapping of original names/companies to their masked versions. + + Args: + text (str): The input text to be analyzed for masking + + Returns: + str: The formatted prompt that will generate a mapping dictionary + """ + prompt = textwrap.dedent(""" + 您是一位专业的法律文档脱敏专家。请分析文本并生成一个脱敏映射表,遵循以下规则: + + 规则: + 1. 人名映射规则: + - 对于同一姓氏的不同人名,使用字母区分: + * 第一个出现的用"姓+某"(如:张三 → 张某) + * 第二个出现的用"姓+某A"(如:张四 → 张某A) + * 第三个出现的用"姓+某B"(如:张五 → 张某B) + 依此类推 + - 三字名同样遵循此规则(如:张三丰 → 张某某,张四海 → 张某某A) + + 2. 公司名映射规则: + - 保留地理位置信息(如:北京、上海等) + - 保留公司类型(如:有限公司、股份公司等) + - 用"某"替换核心名称 + - 对于多个相似公司名,使用字母区分(如: + 北京智慧科技有限公司 → 北京某科技有限公司 + 北京智能科技有限公司 → 北京某科技有限公司A) + + 请分析以下文本,并生成一个JSON格式的映射表,包含所有需要脱敏的名称及其对应的脱敏后的形式: + + {text} + + 请直接输出JSON格式的映射表,格式如下: + {{ + "原文1": "脱敏后1", + "原文2": "脱敏后2", + ... + }} + """) + return prompt.format(text=text) \ No newline at end of file