# Reconstructed from the post-image of git patch 5abfa4998ddf20143c321c6366c8ac258d493bc8
# (author: oliviamn, date: Wed, 21 May 2025 00:15:01 +0800,
# subject: "Implement docx-to-md conversion") for
# src/document_handlers/processors/docx_processor.py.
import logging
import os

import docx
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.data.read_api import read_local_office
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze

from config.settings import settings
from document_handlers.document_processor import DocumentProcessor
from prompts.masking_prompts import get_masking_mapping_prompt
from services.ollama_client import OllamaClient

logger = logging.getLogger(__name__)


class DocxDocumentProcessor(DocumentProcessor):
    """Convert a DOCX file to Markdown, rewrite it through Ollama, and save it.

    The three steps are ``read_content`` (DOCX -> Markdown via ``magic_pdf``),
    ``process_content`` (one Ollama call per sentence-like segment) and
    ``save_content`` (write the result as a ``.md`` file).
    """

    # Delimiter used to cut the Markdown into per-prompt segments
    # (the CJK full stop).
    _SENTENCE_DELIMITER = "。"

    def __init__(self, input_path: str, output_path: str):
        """Store paths, create the image directory and build the Ollama client.

        Args:
            input_path: Path of the DOCX file to read.
            output_path: Path the caller wants the result written to; only its
                directory and base name are used (the extension is replaced
                by ``.md`` in ``save_content``).
        """
        self.input_path = input_path
        self.output_path = output_path
        self.output_dir = os.path.dirname(output_path)
        self.name_without_suff = os.path.splitext(os.path.basename(input_path))[0]

        # Setup output directories: images live in "<output_dir>/images" on
        # disk and are referenced from the Markdown by the bare directory name.
        self.local_image_dir = os.path.join(self.output_dir, "images")
        self.image_dir = os.path.basename(self.local_image_dir)
        os.makedirs(self.local_image_dir, exist_ok=True)

        self.ollama_client = OllamaClient(
            model_name=settings.OLLAMA_MODEL,
            base_url=settings.OLLAMA_API_URL,
        )

    def read_content(self) -> str:
        """Convert the input DOCX to Markdown and return the Markdown text.

        Runs the ``magic_pdf`` pipeline (``doc_analyze`` with ``ocr=True``,
        text mode) on the first dataset returned by ``read_local_office``.
        As a side effect, ``"<input stem>.md"`` is dumped through a writer
        rooted at ``output_dir`` and images go through a writer rooted at
        ``local_image_dir``.

        Returns:
            The Markdown produced by ``pipe_result.get_markdown``.

        Raises:
            Exception: Any error raised during conversion is logged and
                re-raised unchanged.
        """
        try:
            # Initialize writers
            image_writer = FileBasedDataWriter(self.local_image_dir)
            md_writer = FileBasedDataWriter(self.output_dir)

            # Create Dataset Instance and process
            ds = read_local_office(self.input_path)[0]
            pipe_result = ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer)

            # Generate markdown
            md_content = pipe_result.get_markdown(self.image_dir)
            # NOTE(review): this intermediate file holds the text *before*
            # process_content runs and is left in output_dir — confirm that
            # is intended.
            pipe_result.dump_md(md_writer, f"{self.name_without_suff}.md", self.image_dir)

            return md_content
        except Exception as e:
            logger.error("Error converting DOCX to MD: %s", e)
            raise

    def process_content(self, content: str) -> str:
        """Send each sentence-like segment of ``content`` through Ollama.

        ``content`` is split on the CJK full stop. Segments that are empty or
        whitespace-only are skipped (together with their delimiter); every
        other segment is wrapped by ``get_masking_mapping_prompt`` and replaced
        by the model's response.

        The delimiter is re-emitted only after segments that were actually
        followed by one in ``content``, so text that does not end with a full
        stop (or contains none at all) does not gain a trailing one.

        Args:
            content: Markdown text to process.

        Returns:
            The concatenated model responses with the original delimiters
            restored; ``""`` when ``content`` has no non-blank segment.
        """
        logger.info("Processing DOCX content")

        segments = content.split(self._SENTENCE_DELIMITER)
        # str.split yields one more segment than there are delimiters, so
        # every segment except the last was followed by a delimiter.
        last_index = len(segments) - 1

        processed_parts = []
        for index, segment in enumerate(segments):
            if not segment.strip():  # Only process non-empty sentences
                continue

            formatted_prompt = get_masking_mapping_prompt(segment)
            # NOTE(review): the prompt presumably embeds the not-yet-masked
            # text; logging it at INFO may expose what this step is meant to
            # hide — confirm the log level is acceptable.
            logger.info("Calling ollama to generate response, prompt: %s", formatted_prompt)
            response = self.ollama_client.generate(formatted_prompt)
            logger.info("Response generated: %s", response)

            processed_parts.append(response)
            if index < last_index:
                processed_parts.append(self._SENTENCE_DELIMITER)

        return "".join(processed_parts)

    def save_content(self, content: str) -> None:
        """Write ``content`` as UTF-8 to ``output_path`` with a ``.md`` extension.

        Args:
            content: Text to write.

        Raises:
            Exception: Any error raised while writing is logged and re-raised
                unchanged.
        """
        # Ensure output path has .md extension
        output_dir = os.path.dirname(self.output_path)
        base_name = os.path.splitext(os.path.basename(self.output_path))[0]
        md_output_path = os.path.join(output_dir, f"{base_name}.md")

        logger.info("Saving masked content to: %s", md_output_path)
        try:
            with open(md_output_path, 'w', encoding='utf-8') as file:
                file.write(content)
            logger.info("Successfully saved content to %s", md_output_path)
        except Exception as e:
            logger.error("Error saving content: %s", e)
            raise