From 7d0be5aa8a3682471098afa8021b6ebd060cd188 Mon Sep 17 00:00:00 2001 From: oliviamn Date: Tue, 6 May 2025 00:13:19 +0800 Subject: [PATCH] =?UTF-8?q?=E5=B0=86=E9=A2=98=E8=AF=8D=E6=8A=BD=E8=B1=A1?= =?UTF-8?q?=E5=87=BA=E6=9D=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../processors/pdf_processor.py | 39 +++++++++++++------ .../processors/txt_processor.py | 23 +---------- src/prompts/masking_prompts.py | 33 ++++++++++++++++ 3 files changed, 63 insertions(+), 32 deletions(-) create mode 100644 src/prompts/masking_prompts.py diff --git a/src/document_handlers/processors/pdf_processor.py b/src/document_handlers/processors/pdf_processor.py index bbd4e7c..e174a23 100644 --- a/src/document_handlers/processors/pdf_processor.py +++ b/src/document_handlers/processors/pdf_processor.py @@ -5,6 +5,12 @@ from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedData from magic_pdf.data.dataset import PymuDocDataset from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze from magic_pdf.config.enums import SupportedPdfParseMethod +from prompts.masking_prompts import get_masking_prompt +import logging +from services.ollama_client import OllamaClient +from config.settings import settings + +logger = logging.getLogger(__name__) class PdfDocumentProcessor(DocumentProcessor): def __init__(self, input_path: str, output_path: str): @@ -29,12 +35,16 @@ class PdfDocumentProcessor(DocumentProcessor): self.work_local_image_dir = os.path.join(self.work_dir, "images") self.work_image_dir = os.path.basename(self.work_local_image_dir) os.makedirs(self.work_local_image_dir, exist_ok=True) + self.ollama_client = OllamaClient(model_name=settings.OLLAMA_MODEL, base_url=settings.OLLAMA_API_URL) + def read_content(self) -> bytes: with open(self.input_path, 'rb') as file: return file.read() def process_content(self, content: bytes) -> dict: + logger.info("Starting PDF content processing") + # Initialize writers image_writer = FileBasedDataWriter(self.work_local_image_dir) md_writer = FileBasedDataWriter(self.work_dir) @@ -42,6 +52,7 @@ class PdfDocumentProcessor(DocumentProcessor): # Create Dataset Instance ds = PymuDocDataset(content) + logger.info("Classifying PDF type: %s", ds.classify()) # Process based on PDF type if ds.classify() == SupportedPdfParseMethod.OCR: infer_result = ds.apply(doc_analyze, ocr=True) @@ -49,7 +60,8 @@ class PdfDocumentProcessor(DocumentProcessor): else: infer_result = ds.apply(doc_analyze, ocr=False) pipe_result = infer_result.pipe_txt_mode(image_writer) - + + logger.info("Generating all outputs") # Generate all outputs infer_result.draw_model(os.path.join(self.work_dir, f"{self.name_without_suff}_model.pdf")) model_inference_result = infer_result.get_infer_res() @@ -66,16 +78,21 @@ class PdfDocumentProcessor(DocumentProcessor): middle_json = pipe_result.get_middle_json() pipe_result.dump_middle_json(md_writer, f'{self.name_without_suff}_middle.json') - return md_content + logger.info("Masking content") + formatted_prompt = get_masking_prompt(md_content) + logger.info("Calling ollama to generate response") + response = self.ollama_client.generate(formatted_prompt) + logger.info("Response generated") + return response - return { - 'markdown': md_content, - 'content_list': content_list, - 'middle_json': middle_json, - 'model_inference': model_inference_result - } - def save_content(self, content: dict) -> None: - # Content is already saved during processing - with open(self.output_path, 'w', encoding='utf-8') as file: + + def save_content(self, content: str) -> None: + # Ensure output path has .md extension + output_dir = os.path.dirname(self.output_path) + base_name = os.path.splitext(os.path.basename(self.output_path))[0] + md_output_path = os.path.join(output_dir, f"{base_name}.md") + + logger.info(f"Saving masked content to: {md_output_path}") + with open(md_output_path, 'w', encoding='utf-8') as file: file.write(content) \ No newline at end of file diff --git a/src/document_handlers/processors/txt_processor.py b/src/document_handlers/processors/txt_processor.py index c5e5f9a..b6afad1 100644 --- a/src/document_handlers/processors/txt_processor.py +++ b/src/document_handlers/processors/txt_processor.py @@ -1,7 +1,7 @@ from document_handlers.document_processor import DocumentProcessor from services.ollama_client import OllamaClient -import textwrap import logging +from prompts.masking_prompts import get_masking_prompt from config.settings import settings logger = logging.getLogger(__name__) @@ -16,27 +16,8 @@ class TxtDocumentProcessor(DocumentProcessor): return file.read() def process_content(self, content: str) -> str: - prompt = textwrap.dedent(""" - 您是一位专业的法律文档脱敏专家。请按照以下规则对文本进行脱敏处理: - 规则: - 1. 人名: - - 两字名改为"姓+某"(如:张三 → 张某) - - 三字名改为"姓+某某"(如:张三丰 → 张某某) - 2. 公司名: - - 保留地理位置信息(如:北京、上海等) - - 保留公司类型(如:有限公司、股份公司等) - - 用"某"替换核心名称 - 3. 保持原文其他部分不变 - 4. 确保脱敏后的文本保持原有的语言流畅性和可读性 - - 输入文本: - {text} - - 请直接输出脱敏后的文本,无需解释或其他备注。 - """) - - formatted_prompt = prompt.format(text=content) + formatted_prompt = get_masking_prompt(content) response = self.ollama_client.generate(formatted_prompt) logger.debug(f"Processed content: {response}") return response diff --git a/src/prompts/masking_prompts.py b/src/prompts/masking_prompts.py new file mode 100644 index 0000000..98a610a --- /dev/null +++ b/src/prompts/masking_prompts.py @@ -0,0 +1,33 @@ +import textwrap + +def get_masking_prompt(text: str) -> str: + """ + Returns the prompt for masking sensitive information in legal documents. + + Args: + text (str): The input text to be masked + + Returns: + str: The formatted prompt with the input text + """ + prompt = textwrap.dedent(""" + 您是一位专业的法律文档脱敏专家。请按照以下规则对文本进行脱敏处理: + + 规则: + 1. 人名: + - 两字名改为"姓+某"(如:张三 → 张某) + - 三字名改为"姓+某某"(如:张三丰 → 张某某) + 2. 公司名: + - 保留地理位置信息(如:北京、上海等) + - 保留公司类型(如:有限公司、股份公司等) + - 用"某"替换核心名称 + 3. 保持原文其他部分不变 + 4. 确保脱敏后的文本保持原有的语言流畅性和可读性 + + 输入文本: + {text} + + 请直接输出脱敏后的文本,无需解释或其他备注。 + """) + + return prompt.format(text=text) \ No newline at end of file