实现docx转md
This commit is contained in:
parent
0f158c159b
commit
5abfa4998d
|
|
@ -1,20 +1,76 @@
|
||||||
|
import os
|
||||||
import docx
|
import docx
|
||||||
from document_handlers.document_processor import DocumentProcessor
|
from document_handlers.document_processor import DocumentProcessor
|
||||||
|
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
|
||||||
|
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
||||||
|
from magic_pdf.data.read_api import read_local_office
|
||||||
|
import logging
|
||||||
|
from services.ollama_client import OllamaClient
|
||||||
|
from config.settings import settings
|
||||||
|
from prompts.masking_prompts import get_masking_mapping_prompt
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
class DocxDocumentProcessor(DocumentProcessor):
|
class DocxDocumentProcessor(DocumentProcessor):
|
||||||
def __init__(self, input_path: str, output_path: str):
|
def __init__(self, input_path: str, output_path: str):
|
||||||
self.input_path = input_path
|
self.input_path = input_path
|
||||||
self.output_path = output_path
|
self.output_path = output_path
|
||||||
|
self.output_dir = os.path.dirname(output_path)
|
||||||
|
self.name_without_suff = os.path.splitext(os.path.basename(input_path))[0]
|
||||||
|
|
||||||
|
# Setup output directories
|
||||||
|
self.local_image_dir = os.path.join(self.output_dir, "images")
|
||||||
|
self.image_dir = os.path.basename(self.local_image_dir)
|
||||||
|
os.makedirs(self.local_image_dir, exist_ok=True)
|
||||||
|
|
||||||
|
self.ollama_client = OllamaClient(model_name=settings.OLLAMA_MODEL, base_url=settings.OLLAMA_API_URL)
|
||||||
|
|
||||||
def read_content(self) -> str:
|
def read_content(self) -> str:
|
||||||
doc = docx.Document(self.input_path)
|
try:
|
||||||
return '\n'.join([paragraph.text for paragraph in doc.paragraphs])
|
# Initialize writers
|
||||||
|
image_writer = FileBasedDataWriter(self.local_image_dir)
|
||||||
|
md_writer = FileBasedDataWriter(self.output_dir)
|
||||||
|
|
||||||
|
# Create Dataset Instance and process
|
||||||
|
ds = read_local_office(self.input_path)[0]
|
||||||
|
pipe_result = ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer)
|
||||||
|
|
||||||
|
# Generate markdown
|
||||||
|
md_content = pipe_result.get_markdown(self.image_dir)
|
||||||
|
pipe_result.dump_md(md_writer, f"{self.name_without_suff}.md", self.image_dir)
|
||||||
|
|
||||||
|
return md_content
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error converting DOCX to MD: {e}")
|
||||||
|
raise
|
||||||
|
|
||||||
def process_content(self, content: str) -> str:
|
def process_content(self, content: str) -> str:
|
||||||
# Implementation for processing docx content
|
logger.info("Processing DOCX content")
|
||||||
return content
|
|
||||||
|
# Split content into sentences and apply masking
|
||||||
|
sentences = content.split("。")
|
||||||
|
final_md = ""
|
||||||
|
for sentence in sentences:
|
||||||
|
if sentence.strip(): # Only process non-empty sentences
|
||||||
|
formatted_prompt = get_masking_mapping_prompt(sentence)
|
||||||
|
logger.info("Calling ollama to generate response, prompt: %s", formatted_prompt)
|
||||||
|
response = self.ollama_client.generate(formatted_prompt)
|
||||||
|
logger.info(f"Response generated: {response}")
|
||||||
|
final_md += response + "。"
|
||||||
|
|
||||||
|
return final_md
|
||||||
|
|
||||||
def save_content(self, content: str) -> None:
|
def save_content(self, content: str) -> None:
|
||||||
doc = docx.Document()
|
# Ensure output path has .md extension
|
||||||
doc.add_paragraph(content)
|
output_dir = os.path.dirname(self.output_path)
|
||||||
doc.save(self.output_path)
|
base_name = os.path.splitext(os.path.basename(self.output_path))[0]
|
||||||
|
md_output_path = os.path.join(output_dir, f"{base_name}.md")
|
||||||
|
|
||||||
|
logger.info(f"Saving masked content to: {md_output_path}")
|
||||||
|
try:
|
||||||
|
with open(md_output_path, 'w', encoding='utf-8') as file:
|
||||||
|
file.write(content)
|
||||||
|
logger.info(f"Successfully saved content to {md_output_path}")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error saving content: {e}")
|
||||||
|
raise
|
||||||
Loading…
Reference in New Issue