文件写入output folder的.work隐藏目录下
This commit is contained in:
parent
e6fb9b9a83
commit
815427a509
|
|
@ -18,14 +18,26 @@ class PdfDocumentProcessor(DocumentProcessor):
|
||||||
self.image_dir = os.path.basename(self.local_image_dir)
|
self.image_dir = os.path.basename(self.local_image_dir)
|
||||||
os.makedirs(self.local_image_dir, exist_ok=True)
|
os.makedirs(self.local_image_dir, exist_ok=True)
|
||||||
|
|
||||||
|
# Setup work directory under output directory
|
||||||
|
self.work_dir = os.path.join(
|
||||||
|
os.path.dirname(output_path),
|
||||||
|
".work",
|
||||||
|
os.path.splitext(os.path.basename(input_path))[0]
|
||||||
|
)
|
||||||
|
os.makedirs(self.work_dir, exist_ok=True)
|
||||||
|
|
||||||
|
self.work_local_image_dir = os.path.join(self.work_dir, "images")
|
||||||
|
self.work_image_dir = os.path.basename(self.work_local_image_dir)
|
||||||
|
os.makedirs(self.work_local_image_dir, exist_ok=True)
|
||||||
|
|
||||||
def read_content(self) -> bytes:
|
def read_content(self) -> bytes:
|
||||||
with open(self.input_path, 'rb') as file:
|
with open(self.input_path, 'rb') as file:
|
||||||
return file.read()
|
return file.read()
|
||||||
|
|
||||||
def process_content(self, content: bytes) -> dict:
|
def process_content(self, content: bytes) -> dict:
|
||||||
# Initialize writers
|
# Initialize writers
|
||||||
image_writer = FileBasedDataWriter(self.local_image_dir)
|
image_writer = FileBasedDataWriter(self.work_local_image_dir)
|
||||||
md_writer = FileBasedDataWriter(self.output_dir)
|
md_writer = FileBasedDataWriter(self.work_dir)
|
||||||
|
|
||||||
# Create Dataset Instance
|
# Create Dataset Instance
|
||||||
ds = PymuDocDataset(content)
|
ds = PymuDocDataset(content)
|
||||||
|
|
@ -39,17 +51,17 @@ class PdfDocumentProcessor(DocumentProcessor):
|
||||||
pipe_result = infer_result.pipe_txt_mode(image_writer)
|
pipe_result = infer_result.pipe_txt_mode(image_writer)
|
||||||
|
|
||||||
# Generate all outputs
|
# Generate all outputs
|
||||||
infer_result.draw_model(os.path.join(self.output_dir, f"{self.name_without_suff}_model.pdf"))
|
infer_result.draw_model(os.path.join(self.work_dir, f"{self.name_without_suff}_model.pdf"))
|
||||||
model_inference_result = infer_result.get_infer_res()
|
model_inference_result = infer_result.get_infer_res()
|
||||||
|
|
||||||
pipe_result.draw_layout(os.path.join(self.output_dir, f"{self.name_without_suff}_layout.pdf"))
|
pipe_result.draw_layout(os.path.join(self.work_dir, f"{self.name_without_suff}_layout.pdf"))
|
||||||
pipe_result.draw_span(os.path.join(self.output_dir, f"{self.name_without_suff}_spans.pdf"))
|
pipe_result.draw_span(os.path.join(self.work_dir, f"{self.name_without_suff}_spans.pdf"))
|
||||||
|
|
||||||
md_content = pipe_result.get_markdown(self.image_dir)
|
md_content = pipe_result.get_markdown(self.work_image_dir)
|
||||||
pipe_result.dump_md(md_writer, f"{self.name_without_suff}.md", self.image_dir)
|
pipe_result.dump_md(md_writer, f"{self.name_without_suff}.md", self.work_image_dir)
|
||||||
|
|
||||||
content_list = pipe_result.get_content_list(self.image_dir)
|
content_list = pipe_result.get_content_list(self.work_image_dir)
|
||||||
pipe_result.dump_content_list(md_writer, f"{self.name_without_suff}_content_list.json", self.image_dir)
|
pipe_result.dump_content_list(md_writer, f"{self.name_without_suff}_content_list.json", self.work_image_dir)
|
||||||
|
|
||||||
middle_json = pipe_result.get_middle_json()
|
middle_json = pipe_result.get_middle_json()
|
||||||
pipe_result.dump_middle_json(md_writer, f'{self.name_without_suff}_middle.json')
|
pipe_result.dump_middle_json(md_writer, f'{self.name_without_suff}_middle.json')
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue