Refactor PdfDocumentProcessor to enhance PDF content processing
- Updated the read_content method to return raw bytes instead of extracted text.
- Modified the process_content method to handle bytes and generate multiple output files, including Markdown, JSON, and processed PDFs.
- Implemented directory setup for image storage and output management.
- Integrated PymuDocDataset to classify each PDF and process it in OCR or text mode accordingly.
This commit is contained in:
parent
6acf3e5423
commit
edca9a87a0
|
|
@@ -1,20 +1,69 @@
|
||||||
|
import os
|
||||||
import PyPDF2
|
import PyPDF2
|
||||||
from models.document_processor import DocumentProcessor
|
from models.document_processor import DocumentProcessor
|
||||||
|
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
|
||||||
|
from magic_pdf.data.dataset import PymuDocDataset
|
||||||
|
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
||||||
|
from magic_pdf.config.enums import SupportedPdfParseMethod
|
||||||
|
|
||||||
class PdfDocumentProcessor(DocumentProcessor):
|
class PdfDocumentProcessor(DocumentProcessor):
|
||||||
def __init__(self, input_path: str, output_path: str):
|
def __init__(self, input_path: str, output_path: str):
|
||||||
self.input_path = input_path
|
self.input_path = input_path
|
||||||
self.output_path = output_path
|
self.output_path = output_path
|
||||||
|
self.output_dir = os.path.dirname(output_path)
|
||||||
|
self.name_without_suff = os.path.splitext(os.path.basename(input_path))[0]
|
||||||
|
|
||||||
|
# Setup output directories
|
||||||
|
self.local_image_dir = os.path.join(self.output_dir, "images")
|
||||||
|
self.image_dir = os.path.basename(self.local_image_dir)
|
||||||
|
os.makedirs(self.local_image_dir, exist_ok=True)
|
||||||
|
|
||||||
def read_content(self) -> str:
|
def read_content(self) -> bytes:
|
||||||
with open(self.input_path, 'rb') as file:
|
with open(self.input_path, 'rb') as file:
|
||||||
pdf_reader = PyPDF2.PdfReader(file)
|
return file.read()
|
||||||
return ' '.join([page.extract_text() for page in pdf_reader.pages])
|
|
||||||
|
|
||||||
def process_content(self, content: str) -> str:
|
def process_content(self, content: bytes) -> dict:
|
||||||
# Implementation for processing PDF content
|
# Initialize writers
|
||||||
return content
|
image_writer = FileBasedDataWriter(self.local_image_dir)
|
||||||
|
md_writer = FileBasedDataWriter(self.output_dir)
|
||||||
|
|
||||||
def save_content(self, content: str) -> None:
|
# Create Dataset Instance
|
||||||
# Implementation for saving as PDF
|
ds = PymuDocDataset(content)
|
||||||
pass
|
|
||||||
|
# Process based on PDF type
|
||||||
|
if ds.classify() == SupportedPdfParseMethod.OCR:
|
||||||
|
infer_result = ds.apply(doc_analyze, ocr=True)
|
||||||
|
pipe_result = infer_result.pipe_ocr_mode(image_writer)
|
||||||
|
else:
|
||||||
|
infer_result = ds.apply(doc_analyze, ocr=False)
|
||||||
|
pipe_result = infer_result.pipe_txt_mode(image_writer)
|
||||||
|
|
||||||
|
# Generate all outputs
|
||||||
|
infer_result.draw_model(os.path.join(self.output_dir, f"{self.name_without_suff}_model.pdf"))
|
||||||
|
model_inference_result = infer_result.get_infer_res()
|
||||||
|
|
||||||
|
pipe_result.draw_layout(os.path.join(self.output_dir, f"{self.name_without_suff}_layout.pdf"))
|
||||||
|
pipe_result.draw_span(os.path.join(self.output_dir, f"{self.name_without_suff}_spans.pdf"))
|
||||||
|
|
||||||
|
md_content = pipe_result.get_markdown(self.image_dir)
|
||||||
|
pipe_result.dump_md(md_writer, f"{self.name_without_suff}.md", self.image_dir)
|
||||||
|
|
||||||
|
content_list = pipe_result.get_content_list(self.image_dir)
|
||||||
|
pipe_result.dump_content_list(md_writer, f"{self.name_without_suff}_content_list.json", self.image_dir)
|
||||||
|
|
||||||
|
middle_json = pipe_result.get_middle_json()
|
||||||
|
pipe_result.dump_middle_json(md_writer, f'{self.name_without_suff}_middle.json')
|
||||||
|
|
||||||
|
return md_content
|
||||||
|
|
||||||
|
return {
|
||||||
|
'markdown': md_content,
|
||||||
|
'content_list': content_list,
|
||||||
|
'middle_json': middle_json,
|
||||||
|
'model_inference': model_inference_result
|
||||||
|
}
|
||||||
|
|
||||||
|
def save_content(self, content: dict) -> None:
|
||||||
|
# Content is already saved during processing
|
||||||
|
with open(self.output_path, 'w', encoding='utf-8') as file:
|
||||||
|
file.write(content)
|
||||||
Loading…
Reference in New Issue