From edca9a87a0b2bc1eef1d2726eb15e68523b4379f Mon Sep 17 00:00:00 2001
From: oliviamn
Date: Mon, 5 May 2025 19:15:03 +0800
Subject: [PATCH] Refactor PdfDocumentProcessor to enhance PDF content processing

- Updated read_content method to return raw bytes instead of extracted text.
- Modified process_content method to handle bytes and generate multiple output
  files including markdown, JSON, and processed PDFs.
- process_content returns the markdown string, which save_content writes to
  output_path.
- Implemented directory setup for image storage and output management.
- Integrated PymuDocDataset for PDF classification and processing based on OCR
  capabilities.
---
 src/models/processors/pdf_processor.py | 66 ++++++++++++++++++++++----
 1 file changed, 57 insertions(+), 9 deletions(-)

diff --git a/src/models/processors/pdf_processor.py b/src/models/processors/pdf_processor.py
index 4d73d54..7ffb326 100644
--- a/src/models/processors/pdf_processor.py
+++ b/src/models/processors/pdf_processor.py
@@ -1,20 +1,68 @@
+import os
 import PyPDF2
 from models.document_processor import DocumentProcessor
+from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
+from magic_pdf.data.dataset import PymuDocDataset
+from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+from magic_pdf.config.enums import SupportedPdfParseMethod
 
 class PdfDocumentProcessor(DocumentProcessor):
+    """Convert a PDF into markdown and side artifacts using magic_pdf."""
+
     def __init__(self, input_path: str, output_path: str):
+        """Store the paths and create the image directory beside the output file."""
         self.input_path = input_path
         self.output_path = output_path
+        self.output_dir = os.path.dirname(output_path)
+        self.name_without_suff = os.path.splitext(os.path.basename(input_path))[0]
+
+        # Setup output directories
+        self.local_image_dir = os.path.join(self.output_dir, "images")
+        self.image_dir = os.path.basename(self.local_image_dir)
+        os.makedirs(self.local_image_dir, exist_ok=True)
 
-    def read_content(self) -> str:
+    def read_content(self) -> bytes:
+        """Return the raw bytes of the input PDF."""
         with open(self.input_path, 'rb') as file:
-            pdf_reader = PyPDF2.PdfReader(file)
-            return ' '.join([page.extract_text() for page in pdf_reader.pages])
+            return file.read()
 
-    def process_content(self, content: str) -> str:
-        # Implementation for processing PDF content
-        return content
+    def process_content(self, content: bytes) -> str:
+        """Analyze the PDF bytes, write every artifact, and return the markdown.
+
+        Outputs requested under the output directory: <name>_model.pdf,
+        <name>_layout.pdf, <name>_spans.pdf, <name>.md, <name>_content_list.json
+        and <name>_middle.json, where <name> is the input file name without suffix.
+        """
+        # Initialize writers
+        image_writer = FileBasedDataWriter(self.local_image_dir)
+        md_writer = FileBasedDataWriter(self.output_dir)
 
-    def save_content(self, content: str) -> None:
-        # Implementation for saving as PDF
-        pass
\ No newline at end of file
+        # Create Dataset Instance
+        ds = PymuDocDataset(content)
+
+        # Process based on PDF type
+        if ds.classify() == SupportedPdfParseMethod.OCR:
+            infer_result = ds.apply(doc_analyze, ocr=True)
+            pipe_result = infer_result.pipe_ocr_mode(image_writer)
+        else:
+            infer_result = ds.apply(doc_analyze, ocr=False)
+            pipe_result = infer_result.pipe_txt_mode(image_writer)
+
+        # Generate all outputs
+        infer_result.draw_model(os.path.join(self.output_dir, f"{self.name_without_suff}_model.pdf"))
+        pipe_result.draw_layout(os.path.join(self.output_dir, f"{self.name_without_suff}_layout.pdf"))
+        pipe_result.draw_span(os.path.join(self.output_dir, f"{self.name_without_suff}_spans.pdf"))
+
+        md_content = pipe_result.get_markdown(self.image_dir)
+        pipe_result.dump_md(md_writer, f"{self.name_without_suff}.md", self.image_dir)
+        pipe_result.dump_content_list(md_writer, f"{self.name_without_suff}_content_list.json", self.image_dir)
+        pipe_result.dump_middle_json(md_writer, f'{self.name_without_suff}_middle.json')
+
+        # save_content() writes this value to output_path as text, so the
+        # markdown string is what gets returned.
+        return md_content
+
+    def save_content(self, content: str) -> None:
+        """Write the markdown returned by process_content to output_path."""
+        with open(self.output_path, 'w', encoding='utf-8') as file:
+            file.write(content)
\ No newline at end of file