diff --git a/src/document_handlers/processors/pdf_processor.py b/src/document_handlers/processors/pdf_processor.py index 8bbfec4..bbd4e7c 100644 --- a/src/document_handlers/processors/pdf_processor.py +++ b/src/document_handlers/processors/pdf_processor.py @@ -18,14 +18,26 @@ class PdfDocumentProcessor(DocumentProcessor): self.image_dir = os.path.basename(self.local_image_dir) os.makedirs(self.local_image_dir, exist_ok=True) + # Setup work directory under output directory + self.work_dir = os.path.join( + os.path.dirname(output_path), + ".work", + os.path.splitext(os.path.basename(input_path))[0] + ) + os.makedirs(self.work_dir, exist_ok=True) + + self.work_local_image_dir = os.path.join(self.work_dir, "images") + self.work_image_dir = os.path.basename(self.work_local_image_dir) + os.makedirs(self.work_local_image_dir, exist_ok=True) + def read_content(self) -> bytes: with open(self.input_path, 'rb') as file: return file.read() def process_content(self, content: bytes) -> dict: # Initialize writers - image_writer = FileBasedDataWriter(self.local_image_dir) - md_writer = FileBasedDataWriter(self.output_dir) + image_writer = FileBasedDataWriter(self.work_local_image_dir) + md_writer = FileBasedDataWriter(self.work_dir) # Create Dataset Instance ds = PymuDocDataset(content) @@ -39,17 +51,17 @@ class PdfDocumentProcessor(DocumentProcessor): pipe_result = infer_result.pipe_txt_mode(image_writer) # Generate all outputs - infer_result.draw_model(os.path.join(self.output_dir, f"{self.name_without_suff}_model.pdf")) + infer_result.draw_model(os.path.join(self.work_dir, f"{self.name_without_suff}_model.pdf")) model_inference_result = infer_result.get_infer_res() - pipe_result.draw_layout(os.path.join(self.output_dir, f"{self.name_without_suff}_layout.pdf")) - pipe_result.draw_span(os.path.join(self.output_dir, f"{self.name_without_suff}_spans.pdf")) + pipe_result.draw_layout(os.path.join(self.work_dir, f"{self.name_without_suff}_layout.pdf")) + pipe_result.draw_span(os.path.join(self.work_dir, f"{self.name_without_suff}_spans.pdf")) - md_content = pipe_result.get_markdown(self.image_dir) - pipe_result.dump_md(md_writer, f"{self.name_without_suff}.md", self.image_dir) + md_content = pipe_result.get_markdown(self.work_image_dir) + pipe_result.dump_md(md_writer, f"{self.name_without_suff}.md", self.work_image_dir) - content_list = pipe_result.get_content_list(self.image_dir) - pipe_result.dump_content_list(md_writer, f"{self.name_without_suff}_content_list.json", self.image_dir) + content_list = pipe_result.get_content_list(self.work_image_dir) + pipe_result.dump_content_list(md_writer, f"{self.name_without_suff}_content_list.json", self.work_image_dir) middle_json = pipe_result.get_middle_json() pipe_result.dump_middle_json(md_writer, f'{self.name_without_suff}_middle.json')