实现docx转md
This commit is contained in:
parent
0f158c159b
commit
5abfa4998d
|
|
@@ -1,20 +1,76 @@
|
|||
import os
|
||||
import docx
|
||||
from document_handlers.document_processor import DocumentProcessor
|
||||
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
|
||||
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
||||
from magic_pdf.data.read_api import read_local_office
|
||||
import logging
|
||||
from services.ollama_client import OllamaClient
|
||||
from config.settings import settings
|
||||
from prompts.masking_prompts import get_masking_mapping_prompt
|
||||
|
||||
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
|
||||
|
||||
class DocxDocumentProcessor(DocumentProcessor):
    """Convert a DOCX file to Markdown, mask it sentence by sentence, and save it.

    The three steps are exposed as separate methods:

    * ``read_content`` converts the input document to Markdown through the
      ``magic_pdf`` pipeline.
    * ``process_content`` sends each sentence to the Ollama client using the
      masking prompt and concatenates the responses.
    * ``save_content`` writes the final text to a ``.md`` file next to
      ``output_path``.
    """

    def __init__(self, input_path: str, output_path: str):
        """Store the paths, create the image directory and build the Ollama client.

        Args:
            input_path: Path of the DOCX file to convert.
            output_path: Target path; its directory receives the generated
                files and its base name is reused for the saved ``.md`` file.
        """
        self.input_path = input_path
        self.output_path = output_path
        self.output_dir = os.path.dirname(output_path)
        self.name_without_suff = os.path.splitext(os.path.basename(input_path))[0]

        # Images extracted during conversion go to "<output_dir>/images"; the
        # Markdown refers to them by the bare directory name ("images").
        self.local_image_dir = os.path.join(self.output_dir, "images")
        self.image_dir = os.path.basename(self.local_image_dir)
        os.makedirs(self.local_image_dir, exist_ok=True)

        self.ollama_client = OllamaClient(
            model_name=settings.OLLAMA_MODEL,
            base_url=settings.OLLAMA_API_URL,
        )

    def read_content(self) -> str:
        """Convert the input DOCX to Markdown and return the Markdown text.

        Besides returning the text, the Markdown is also dumped through a
        writer rooted at ``output_dir`` under the name
        ``<input base name>.md``.

        Returns:
            The Markdown produced by the conversion pipeline.

        Raises:
            Exception: Any error raised by the conversion pipeline is logged
                and re-raised unchanged.
        """
        try:
            # Initialize writers
            image_writer = FileBasedDataWriter(self.local_image_dir)
            md_writer = FileBasedDataWriter(self.output_dir)

            # Create Dataset Instance and process.
            # NOTE(review): only the first dataset returned by
            # read_local_office is used -- confirm a single DOCX always
            # yields exactly one.
            ds = read_local_office(self.input_path)[0]
            pipe_result = ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer)

            # Generate markdown
            md_content = pipe_result.get_markdown(self.image_dir)
            pipe_result.dump_md(md_writer, f"{self.name_without_suff}.md", self.image_dir)

            return md_content
        except Exception as e:
            logger.error("Error converting DOCX to MD: %s", e)
            raise

    def process_content(self, content: str) -> str:
        """Mask ``content`` one sentence at a time via the Ollama client.

        The text is split on the full stop "。"; blank fragments are skipped.
        Each remaining sentence is wrapped in the masking prompt, sent to the
        model, and the response is emitted followed by "。".

        Args:
            content: Text to mask (the Markdown from ``read_content``).

        Returns:
            The concatenated model responses, each terminated with "。".
            An input with no non-blank sentence yields an empty string.
        """
        logger.info("Processing DOCX content")

        masked_parts = []
        for sentence in content.split("。"):
            if not sentence.strip():  # Only process non-empty sentences
                continue
            formatted_prompt = get_masking_mapping_prompt(sentence)
            logger.info("Calling ollama to generate response, prompt: %s", formatted_prompt)
            response = self.ollama_client.generate(formatted_prompt)
            logger.info("Response generated: %s", response)
            masked_parts.append(response + "。")

        return "".join(masked_parts)

    def save_content(self, content: str) -> None:
        """Write ``content`` as UTF-8 to ``<output_path base name>.md``.

        The file is placed in the directory of ``output_path``; whatever
        extension ``output_path`` carries is replaced with ``.md``.

        Args:
            content: Text to write.

        Raises:
            Exception: Any error raised while writing is logged and re-raised.
        """
        # Ensure output path has .md extension
        output_dir = os.path.dirname(self.output_path)
        base_name = os.path.splitext(os.path.basename(self.output_path))[0]
        md_output_path = os.path.join(output_dir, f"{base_name}.md")

        logger.info("Saving masked content to: %s", md_output_path)
        try:
            with open(md_output_path, "w", encoding="utf-8") as file:
                file.write(content)
        except Exception as e:
            logger.error("Error saving content: %s", e)
            raise
        else:
            logger.info("Successfully saved content to %s", md_output_path)
|
||||
Loading…
Reference in New Issue