import os
from typing import Union

import PyPDF2
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze

from models.document_processor import DocumentProcessor


class PdfDocumentProcessor(DocumentProcessor):
    """Convert a PDF file to markdown using the magic_pdf pipeline.

    Besides the markdown written to ``output_path``, processing leaves
    several side artifacts (annotated PDFs and JSON dumps) in the directory
    that contains ``output_path``, with extracted images in an ``images``
    sub-directory.
    """

    def __init__(self, input_path: str, output_path: str):
        """Store the paths and create the image output directory.

        Args:
            input_path: Path of the PDF file to read.
            output_path: Path of the markdown file written by
                ``save_content``; its parent directory receives every other
                artifact.
        """
        self.input_path = input_path
        self.output_path = output_path
        self.output_dir = os.path.dirname(output_path)
        # Input file name without directory or extension; used as the prefix
        # of every generated artifact.
        self.name_without_suff = os.path.splitext(os.path.basename(input_path))[0]

        # Setup output directories
        self.local_image_dir = os.path.join(self.output_dir, "images")
        # Bare directory name ("images"), passed to the pipeline so image
        # references are expressed relative to the output directory.
        self.image_dir = os.path.basename(self.local_image_dir)
        os.makedirs(self.local_image_dir, exist_ok=True)

    def read_content(self) -> bytes:
        """Return the raw bytes of the input PDF."""
        with open(self.input_path, 'rb') as file:
            return file.read()

    def process_content(self, content: bytes) -> str:
        """Run the PDF through magic_pdf and return the generated markdown.

        The dataset is classified first; OCR-classified documents go through
        the OCR pipeline, everything else through the text pipeline.

        Side effects, all written under ``self.output_dir``:
            * ``<name>_model.pdf``, ``<name>_layout.pdf``, ``<name>_spans.pdf``
            * ``<name>.md``
            * ``<name>_content_list.json``
            * ``<name>_middle.json``
            * extracted images under ``images/``

        Args:
            content: Raw bytes of the PDF, as returned by ``read_content``.

        Returns:
            The value of ``pipe_result.get_markdown(self.image_dir)``, i.e.
            the markdown that ``save_content`` writes to ``output_path``.
        """
        # Initialize writers
        image_writer = FileBasedDataWriter(self.local_image_dir)
        md_writer = FileBasedDataWriter(self.output_dir)

        # Create Dataset Instance
        ds = PymuDocDataset(content)

        # Process based on PDF type
        if ds.classify() == SupportedPdfParseMethod.OCR:
            infer_result = ds.apply(doc_analyze, ocr=True)
            pipe_result = infer_result.pipe_ocr_mode(image_writer)
        else:
            infer_result = ds.apply(doc_analyze, ocr=False)
            pipe_result = infer_result.pipe_txt_mode(image_writer)

        stem = self.name_without_suff

        # Generate all outputs
        infer_result.draw_model(os.path.join(self.output_dir, f"{stem}_model.pdf"))
        pipe_result.draw_layout(os.path.join(self.output_dir, f"{stem}_layout.pdf"))
        pipe_result.draw_span(os.path.join(self.output_dir, f"{stem}_spans.pdf"))

        md_content = pipe_result.get_markdown(self.image_dir)
        pipe_result.dump_md(md_writer, f"{stem}.md", self.image_dir)
        pipe_result.dump_content_list(md_writer, f"{stem}_content_list.json", self.image_dir)
        pipe_result.dump_middle_json(md_writer, f'{stem}_middle.json')

        # Only the markdown is returned; the other results are available
        # through the files dumped above.
        return md_content

    def save_content(self, content: Union[str, dict]) -> None:
        """Write the markdown to ``self.output_path`` as UTF-8 text.

        Args:
            content: The markdown string returned by ``process_content``.
                A dict carrying the markdown under the ``'markdown'`` key is
                accepted as well, so callers that follow the previously
                declared ``dict`` annotation do not fail in ``file.write``.

        Raises:
            KeyError: If ``content`` is a dict without a ``'markdown'`` key.
        """
        markdown = content['markdown'] if isinstance(content, dict) else content
        with open(self.output_path, 'w', encoding='utf-8') as file:
            file.write(markdown)