# legal-doc-masker/mineru/app/api/endpoints/mineru.py
# (file-viewer header: 330 lines, 14 KiB, Python)

from fastapi import APIRouter, HTTPException, UploadFile, File, BackgroundTasks
from fastapi.responses import FileResponse
from typing import List, Optional
import os
import tempfile
import shutil
import json
from pathlib import Path
import uuid
from loguru import logger
from ...core.config import settings
# Import mineru functions
from mineru.cli.common import convert_pdf_bytes_to_bytes_by_pypdfium2, prepare_env, read_fn
from mineru.data.data_reader_writer import FileBasedDataWriter
from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox
from mineru.utils.enum_class import MakeMode
from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
from mineru.backend.pipeline.pipeline_analyze import doc_analyze as pipeline_doc_analyze
from mineru.backend.pipeline.pipeline_middle_json_mkcontent import union_make as pipeline_union_make
from mineru.backend.pipeline.model_json_to_middle_json import result_to_middle_json as pipeline_result_to_middle_json
from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
router = APIRouter()
class MineruParseRequest:
    """Plain holder for the options of a single Mineru parse request.

    Every constructor argument is stored unchanged on the instance under the
    same name, except ``make_md_mode``: the string ``"MM_MD"`` is stored as
    ``MakeMode.MM_MD`` and any other value is stored as
    ``MakeMode.CONTENT_LIST``.
    """

    def __init__(
        self,
        lang: str = "ch",
        backend: str = "pipeline",
        method: str = "auto",
        server_url: Optional[str] = None,
        start_page_id: int = 0,
        end_page_id: Optional[int] = None,
        formula_enable: bool = True,
        table_enable: bool = True,
        draw_layout_bbox: bool = True,
        draw_span_bbox: bool = True,
        dump_md: bool = True,
        dump_middle_json: bool = True,
        dump_model_output: bool = True,
        dump_orig_pdf: bool = True,
        dump_content_list: bool = True,
        make_md_mode: str = "MM_MD"
    ):
        # Backend selection and parsing strategy.
        self.lang, self.backend, self.method = lang, backend, method
        self.server_url = server_url

        # Page window to parse.
        self.start_page_id, self.end_page_id = start_page_id, end_page_id

        # Optional recognition features.
        self.formula_enable, self.table_enable = formula_enable, table_enable

        # Which artifacts should be produced.
        self.draw_layout_bbox, self.draw_span_bbox = draw_layout_bbox, draw_span_bbox
        self.dump_md = dump_md
        self.dump_middle_json = dump_middle_json
        self.dump_model_output = dump_model_output
        self.dump_orig_pdf = dump_orig_pdf
        self.dump_content_list = dump_content_list

        # Only the exact string "MM_MD" selects the markdown mode; everything
        # else falls back to the content-list mode.
        if make_md_mode == "MM_MD":
            self.make_md_mode = MakeMode.MM_MD
        else:
            self.make_md_mode = MakeMode.CONTENT_LIST
def _write_shared_outputs(
    request: "MineruParseRequest",
    *,
    file_name,
    local_md_dir,
    local_image_dir,
    md_writer,
    pdf_info,
    middle_json,
    pdf_bytes,
    union_make,
    with_span_bbox,
) -> dict:
    """Write the artifacts both backends produce and return their paths.

    Covers layout/span bbox PDFs, the original PDF, markdown, the content list
    and the middle JSON, each gated by the matching flag on ``request``.
    ``union_make`` is the backend-specific content builder and
    ``with_span_bbox`` is False for backends that do not draw span boxes.
    """
    # Wrapping in Path makes the ``/`` joins below valid whether the directory
    # arrives as a str or as a Path.
    md_dir = Path(local_md_dir)
    image_dir = str(os.path.basename(local_image_dir))
    outputs = {}

    if request.draw_layout_bbox:
        draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, f"{file_name}_layout.pdf")
        outputs["layout_pdf"] = str(md_dir / f"{file_name}_layout.pdf")

    if with_span_bbox and request.draw_span_bbox:
        draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, f"{file_name}_span.pdf")
        outputs["span_pdf"] = str(md_dir / f"{file_name}_span.pdf")

    if request.dump_orig_pdf:
        md_writer.write(f"{file_name}_origin.pdf", pdf_bytes)
        outputs["original_pdf"] = str(md_dir / f"{file_name}_origin.pdf")

    if request.dump_md:
        md_content_str = union_make(pdf_info, request.make_md_mode, image_dir)
        md_writer.write_string(f"{file_name}.md", md_content_str)
        outputs["markdown"] = str(md_dir / f"{file_name}.md")

    if request.dump_content_list:
        content_list = union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir)
        md_writer.write_string(
            f"{file_name}_content_list.json",
            json.dumps(content_list, ensure_ascii=False, indent=4)
        )
        outputs["content_list"] = str(md_dir / f"{file_name}_content_list.json")

    if request.dump_middle_json:
        md_writer.write_string(
            f"{file_name}_middle.json",
            json.dumps(middle_json, ensure_ascii=False, indent=4)
        )
        outputs["middle_json"] = str(md_dir / f"{file_name}_middle.json")

    return outputs


async def process_mineru_document(
    file: UploadFile,
    request: MineruParseRequest,
    output_dir: Path
) -> dict:
    """Process a single uploaded document using Mineru.

    Args:
        file: The uploaded document.
        request: Parse options; ``request.backend`` must be ``"pipeline"`` or
            start with ``"vlm-"``.
        output_dir: Directory under which the per-document output folder is
            created.

    Returns:
        A dict with ``status``, ``file_name``, ``outputs`` (artifact name ->
        file path) and ``output_directory``.

    Raises:
        HTTPException: 400 for an unsupported backend, 500 for any failure
            while reading or analysing the document.
    """
    temp_file_path: Optional[Path] = None
    try:
        # Reject unknown backends up front; previously this fell through to an
        # unbound ``backend`` variable and was reported as a 500.
        if request.backend != "pipeline" and not request.backend.startswith("vlm-"):
            raise HTTPException(
                status_code=400,
                detail=f"Unsupported backend: {request.backend}"
            )

        # ``filename`` may be missing on the upload; never feed None to Path.
        upload_name = Path(file.filename or "")
        file_name = upload_name.stem
        # A stem made only of dots ("", ".", "..") would alias or escape the
        # output directory when used as a path segment.
        if not file_name.strip("."):
            file_name = "document"

        # Read file content
        content = await file.read()

        # Spool the upload to disk so read_fn can load it by path. The suffix
        # is lower-cased because read_fn is presumed to dispatch on it —
        # NOTE(review): confirm against the installed MinerU version.
        with tempfile.NamedTemporaryFile(
            delete=False, suffix=upload_name.suffix.lower()
        ) as temp_file:
            # Record the path before writing so a failed write is still cleaned up.
            temp_file_path = Path(temp_file.name)
            temp_file.write(content)

        # read_fn is expected to return PDF bytes, converting image uploads to
        # PDF; passing the raw upload bytes on directly (as before) handed
        # image data to PDF-only code.
        pdf_bytes = read_fn(temp_file_path)

        # Prepare environment
        local_image_dir, local_md_dir = prepare_env(output_dir, file_name, request.method)
        image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
        md_dir = Path(local_md_dir)

        # NOTE(review): the analysis calls below are synchronous and presumably
        # heavy; inside this coroutine they block the event loop.
        if request.backend == "pipeline":
            # Restrict the document to the requested page window.
            pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(
                pdf_bytes, request.start_page_id, request.end_page_id
            )
            # Analyze document (batch API: one document in, one result out).
            infer_results, all_image_lists, all_pdf_docs, lang_list, ocr_enabled_list = pipeline_doc_analyze(
                [pdf_bytes], [request.lang],
                parse_method=request.method,
                formula_enable=request.formula_enable,
                table_enable=request.table_enable
            )
            model_list = infer_results[0]
            middle_json = pipeline_result_to_middle_json(
                model_list, all_image_lists[0], all_pdf_docs[0], image_writer,
                lang_list[0], ocr_enabled_list[0], request.formula_enable
            )
            outputs = _write_shared_outputs(
                request,
                file_name=file_name,
                local_md_dir=local_md_dir,
                local_image_dir=local_image_dir,
                md_writer=md_writer,
                pdf_info=middle_json["pdf_info"],
                middle_json=middle_json,
                pdf_bytes=pdf_bytes,
                union_make=pipeline_union_make,
                with_span_bbox=True,
            )
            if request.dump_model_output:
                md_writer.write_string(
                    f"{file_name}_model.json",
                    json.dumps(model_list, ensure_ascii=False, indent=4)
                )
                outputs["model_output"] = str(md_dir / f"{file_name}_model.json")
        else:
            # VLM backend: strip the "vlm-" prefix validated above.
            # NOTE(review): start_page_id/end_page_id are not applied on this
            # path (unchanged from the previous behaviour).
            vlm_backend = request.backend[4:]
            middle_json, infer_result = vlm_doc_analyze(
                pdf_bytes, image_writer=image_writer,
                backend=vlm_backend, server_url=request.server_url
            )
            outputs = _write_shared_outputs(
                request,
                file_name=file_name,
                local_md_dir=local_md_dir,
                local_image_dir=local_image_dir,
                md_writer=md_writer,
                pdf_info=middle_json["pdf_info"],
                middle_json=middle_json,
                pdf_bytes=pdf_bytes,
                union_make=vlm_union_make,
                with_span_bbox=False,
            )
            if request.dump_model_output:
                model_output = ("\n" + "-" * 50 + "\n").join(infer_result)
                md_writer.write_string(f"{file_name}_model_output.txt", model_output)
                outputs["model_output"] = str(md_dir / f"{file_name}_model_output.txt")

        return {
            "status": "success",
            "file_name": file_name,
            "outputs": outputs,
            "output_directory": str(local_md_dir)
        }
    except HTTPException:
        # Deliberate HTTP errors (e.g. the 400 above) must keep their status.
        raise
    except Exception as e:
        logger.exception(f"Error processing document: {e}")
        raise HTTPException(status_code=500, detail=f"Error processing document: {str(e)}") from e
    finally:
        # Clean up temporary file
        if temp_file_path is not None and temp_file_path.exists():
            temp_file_path.unlink()
@router.post("/parse")
async def parse_document(
    file: UploadFile = File(...),
    lang: str = "ch",
    backend: str = "pipeline",
    method: str = "auto",
    server_url: Optional[str] = None,
    start_page_id: int = 0,
    end_page_id: Optional[int] = None,
    formula_enable: bool = True,
    table_enable: bool = True,
    draw_layout_bbox: bool = True,
    draw_span_bbox: bool = True,
    dump_md: bool = True,
    dump_middle_json: bool = True,
    dump_model_output: bool = True,
    dump_orig_pdf: bool = True,
    dump_content_list: bool = True,
    make_md_mode: str = "MM_MD"
):
    """
    Parse a document using Mineru API

    Parameters:
    - file: The document file to parse (.pdf, .png, .jpeg or .jpg)
    - lang: Language option (default: 'ch')
    - backend: Backend for parsing ('pipeline', 'vlm-transformers', 'vlm-sglang-engine', 'vlm-sglang-client')
    - method: Method for parsing ('auto', 'txt', 'ocr')
    - server_url: Server URL for vlm-sglang-client backend
    - start_page_id: Start page ID for parsing
    - end_page_id: End page ID for parsing
    - formula_enable: Enable formula parsing
    - table_enable: Enable table parsing
    - draw_layout_bbox: Whether to draw layout bounding boxes
    - draw_span_bbox: Whether to draw span bounding boxes
    - dump_md: Whether to dump markdown files
    - dump_middle_json: Whether to dump middle JSON files
    - dump_model_output: Whether to dump model output files
    - dump_orig_pdf: Whether to dump original PDF files
    - dump_content_list: Whether to dump content list files
    - make_md_mode: The mode for making markdown content

    Returns the result dict of the document processing step. Responds with
    400 when the file type or backend is not supported.
    """
    # NOTE: the draw_layout_bbox / draw_span_bbox parameters shadow the
    # module-level drawing functions inside this function only; they are part
    # of the public query interface and therefore keep their names.

    # Validate file type. ``filename`` may be absent on the upload, so fall
    # back to "" (no suffix) and let the check below answer with a 400.
    allowed_extensions = {".pdf", ".png", ".jpeg", ".jpg"}
    file_extension = Path(file.filename or "").suffix.lower()
    if file_extension not in allowed_extensions:
        raise HTTPException(
            status_code=400,
            # sorted() keeps the message stable; set iteration order is not.
            detail=f"File type not allowed. Allowed types: {', '.join(sorted(allowed_extensions))}"
        )

    # Validate the backend before any directory is created, so a typo does not
    # leave an empty output folder behind or surface as a server error.
    if backend != "pipeline" and not backend.startswith("vlm-"):
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported backend: {backend}"
        )

    # Create request object
    request = MineruParseRequest(
        lang=lang,
        backend=backend,
        method=method,
        server_url=server_url,
        start_page_id=start_page_id,
        end_page_id=end_page_id,
        formula_enable=formula_enable,
        table_enable=table_enable,
        draw_layout_bbox=draw_layout_bbox,
        draw_span_bbox=draw_span_bbox,
        dump_md=dump_md,
        dump_middle_json=dump_middle_json,
        dump_model_output=dump_model_output,
        dump_orig_pdf=dump_orig_pdf,
        dump_content_list=dump_content_list,
        make_md_mode=make_md_mode
    )

    # Each request gets its own UUID-named output directory.
    output_dir = settings.PROCESSED_FOLDER / "mineru" / str(uuid.uuid4())
    output_dir.mkdir(parents=True, exist_ok=True)

    # Process document
    return await process_mineru_document(file, request, output_dir)
@router.get("/download/{file_path:path}")
async def download_processed_file(file_path: str):
    """Download a processed file from the mineru output directory.

    Args:
        file_path: Path of the file relative to the ``mineru`` folder inside
            ``settings.PROCESSED_FOLDER``.

    Returns:
        A ``FileResponse`` streaming the file as ``application/octet-stream``.

    Raises:
        HTTPException: 400 if the path is malformed or points outside the
            mineru output directory, 404 if it is not an existing regular
            file, 500 for any other failure.
    """
    try:
        base_dir = Path(settings.PROCESSED_FOLDER, "mineru").resolve()

        # Security check: resolve ".." segments (and absolute overrides) BEFORE
        # comparing, and compare path components rather than string prefixes.
        # A startswith() test on the unresolved path accepts "../../secret"
        # as well as sibling folders sharing the same prefix.
        try:
            full_path = (base_dir / file_path).resolve()
            full_path.relative_to(base_dir)
        except ValueError:
            # Raised when the path is outside base_dir or cannot be resolved.
            raise HTTPException(status_code=400, detail="Invalid file path") from None

        # is_file() also rejects directories, which cannot be served.
        if not full_path.is_file():
            raise HTTPException(status_code=404, detail="File not found")

        return FileResponse(
            path=str(full_path),
            filename=full_path.name,
            media_type="application/octet-stream"
        )
    except HTTPException:
        # Keep the intended 400/404 instead of re-wrapping them as 500.
        raise
    except Exception as e:
        logger.exception(f"Error downloading file: {e}")
        raise HTTPException(status_code=500, detail=f"Error downloading file: {str(e)}") from e
@router.get("/health")
async def health_check():
    """Report that the mineru service is up.

    Returns a fixed payload naming the service and its status; no checks are
    performed.
    """
    payload = {
        "status": "healthy",
        "service": "mineru",
    }
    return payload