实现docx转md

This commit is contained in:
oliviamn 2025-05-21 00:15:01 +08:00
parent 0f158c159b
commit 5abfa4998d
1 changed files with 63 additions and 7 deletions

View File

@ -1,20 +1,76 @@
import os
import docx
from document_handlers.document_processor import DocumentProcessor
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.data.read_api import read_local_office
import logging
from services.ollama_client import OllamaClient
from config.settings import settings
from prompts.masking_prompts import get_masking_mapping_prompt
logger = logging.getLogger(__name__)
class DocxDocumentProcessor(DocumentProcessor):
    """Convert a DOCX file to Markdown, mask it via Ollama, and save it as ``.md``.

    The three steps are exposed as ``read_content`` (DOCX -> Markdown through
    the magic_pdf office pipeline), ``process_content`` (sentence-by-sentence
    masking through the Ollama client) and ``save_content`` (write the result
    next to ``output_path`` with a ``.md`` extension).
    """

    # NOTE(review): the separator literal was empty (``content.split("")``),
    # which makes ``str.split`` raise ``ValueError: empty separator``. The
    # Chinese full stop is assumed to be the intended sentence delimiter --
    # confirm against the upstream repository.
    SENTENCE_DELIMITER = "。"

    def __init__(self, input_path: str, output_path: str):
        """Store the paths and prepare the image output directory.

        Args:
            input_path: Path of the DOCX file to convert.
            output_path: Target path; its directory receives the Markdown
                file and an ``images`` sub-directory, which is created here.
        """
        self.input_path = input_path
        self.output_path = output_path
        self.output_dir = os.path.dirname(output_path)
        self.name_without_suff = os.path.splitext(os.path.basename(input_path))[0]

        # Setup output directories
        self.local_image_dir = os.path.join(self.output_dir, "images")
        # Only the directory name ("images") is handed to the Markdown
        # generator, not the full local path.
        self.image_dir = os.path.basename(self.local_image_dir)
        os.makedirs(self.local_image_dir, exist_ok=True)

        self.ollama_client = OllamaClient(
            model_name=settings.OLLAMA_MODEL,
            base_url=settings.OLLAMA_API_URL,
        )

    def read_content(self) -> str:
        """Convert the input DOCX to Markdown and return the Markdown text.

        As a side effect the Markdown is also dumped to
        ``<output_dir>/<input name>.md`` through the magic_pdf writer.

        Returns:
            The Markdown produced by the magic_pdf pipeline.

        Raises:
            Exception: Any error raised during conversion is logged and
                re-raised unchanged.
        """
        try:
            # Initialize writers
            image_writer = FileBasedDataWriter(self.local_image_dir)
            md_writer = FileBasedDataWriter(self.output_dir)

            # Create Dataset Instance and process; only the first dataset
            # returned for the office file is used.
            ds = read_local_office(self.input_path)[0]
            pipe_result = ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer)

            # Generate markdown
            md_content = pipe_result.get_markdown(self.image_dir)
            pipe_result.dump_md(md_writer, f"{self.name_without_suff}.md", self.image_dir)
            return md_content
        except Exception as e:
            logger.exception("Error converting DOCX to MD: %s", e)
            raise

    def process_content(self, content: str) -> str:
        """Mask ``content`` sentence by sentence using the Ollama client.

        The text is split on ``SENTENCE_DELIMITER``; every non-blank sentence
        is wrapped in the masking prompt and replaced by the model response.
        Blank fragments are dropped. The delimiter is re-inserted after each
        masked sentence that was followed by one in the input.

        Args:
            content: Markdown text to mask.

        Returns:
            The concatenated model responses.
        """
        logger.info("Processing DOCX content")

        delimiter = self.SENTENCE_DELIMITER
        # Split content into sentences and apply masking
        sentences = content.split(delimiter)
        last_index = len(sentences) - 1

        masked_parts = []
        for index, sentence in enumerate(sentences):
            if not sentence.strip():  # Only process non-empty sentences
                continue
            formatted_prompt = get_masking_mapping_prompt(sentence)
            # NOTE(review): the prompt and response contain the unmasked
            # document text; confirm that logging them at INFO is acceptable.
            logger.info("Calling ollama to generate response, prompt: %s", formatted_prompt)
            response = self.ollama_client.generate(formatted_prompt)
            logger.info("Response generated: %s", response)
            masked_parts.append(response)
            # The final fragment had no delimiter after it in the input.
            if index != last_index:
                masked_parts.append(delimiter)
        return "".join(masked_parts)

    def save_content(self, content: str) -> None:
        """Write ``content`` as UTF-8 to ``output_path`` with a ``.md`` extension.

        Args:
            content: Text to write.

        Raises:
            Exception: Any error raised while writing is logged and re-raised.
        """
        # Ensure output path has .md extension
        output_dir = os.path.dirname(self.output_path)
        base_name = os.path.splitext(os.path.basename(self.output_path))[0]
        md_output_path = os.path.join(output_dir, f"{base_name}.md")

        logger.info("Saving masked content to: %s", md_output_path)
        try:
            with open(md_output_path, 'w', encoding='utf-8') as file:
                file.write(content)
            logger.info("Successfully saved content to %s", md_output_path)
        except Exception as e:
            logger.exception("Error saving content: %s", e)
            raise