# Mineru document-parsing API routes.
import json
import os
import shutil
import tempfile
import uuid
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional

from fastapi import APIRouter, BackgroundTasks, File, HTTPException, UploadFile
from fastapi.responses import FileResponse
from loguru import logger

from mineru.backend.pipeline.model_json_to_middle_json import result_to_middle_json as pipeline_result_to_middle_json
from mineru.backend.pipeline.pipeline_analyze import doc_analyze as pipeline_doc_analyze
from mineru.backend.pipeline.pipeline_middle_json_mkcontent import union_make as pipeline_union_make
from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
from mineru.cli.common import convert_pdf_bytes_to_bytes_by_pypdfium2, prepare_env, read_fn
from mineru.data.data_reader_writer import FileBasedDataWriter
from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox
from mineru.utils.enum_class import MakeMode

from ...core.config import settings
|
|
|
|
router = APIRouter()
|
|
|
|
@dataclass
class MineruParseRequest:
    """Options controlling a single Mineru parse run.

    Mirrors the query parameters of the ``/parse`` endpoint one-to-one.
    Field order, names and defaults match the original hand-written
    constructor, so positional and keyword construction are unchanged.
    ``make_md_mode`` is accepted as a string and normalized to a
    ``MakeMode`` member after construction.
    """

    lang: str = "ch"                        # language hint for OCR/parsing
    backend: str = "pipeline"               # "pipeline" or "vlm-<engine>"
    method: str = "auto"                    # "auto" | "txt" | "ocr" (pipeline only)
    server_url: Optional[str] = None        # only used by the vlm-sglang-client backend
    start_page_id: int = 0                  # first page to parse (0-based)
    end_page_id: Optional[int] = None       # None = parse through the last page
    formula_enable: bool = True
    table_enable: bool = True
    draw_layout_bbox: bool = True           # emit <name>_layout.pdf
    draw_span_bbox: bool = True             # emit <name>_span.pdf (pipeline only)
    dump_md: bool = True
    dump_middle_json: bool = True
    dump_model_output: bool = True
    dump_orig_pdf: bool = True
    dump_content_list: bool = True
    make_md_mode: str = "MM_MD"             # normalized to a MakeMode in __post_init__

    def __post_init__(self) -> None:
        # Preserve the original constructor's behavior: any value other than
        # the literal "MM_MD" falls back to CONTENT_LIST mode.
        self.make_md_mode = MakeMode.MM_MD if self.make_md_mode == "MM_MD" else MakeMode.CONTENT_LIST
|
|
|
|
async def process_mineru_document(
    file: UploadFile,
    request: MineruParseRequest,
    output_dir: Path,
) -> dict:
    """Parse one uploaded document with Mineru and write the requested artifacts.

    Args:
        file: The uploaded document (PDF or image).
        request: Parse options; ``request.backend`` selects the classic
            pipeline ("pipeline") or a VLM backend ("vlm-<engine>").
        output_dir: Root directory under which the per-document output
            folder is created by ``prepare_env``.

    Returns:
        dict with ``status``, ``file_name``, ``outputs`` (artifact name ->
        absolute path) and ``output_directory``.

    Raises:
        HTTPException: 500 wrapping any processing failure.
    """
    try:
        # The raw bytes are used directly by both backends. (The previous
        # version also wrote them to a NamedTemporaryFile that was never
        # read — dead code, removed.)
        content = await file.read()

        file_name = Path(file.filename).stem
        local_image_dir, local_md_dir = prepare_env(output_dir, file_name, request.method)
        # prepare_env may hand back plain strings — TODO confirm; normalize to
        # Path so the `/` joins below work either way.
        local_image_dir, local_md_dir = Path(local_image_dir), Path(local_md_dir)
        image_writer = FileBasedDataWriter(str(local_image_dir))
        md_writer = FileBasedDataWriter(str(local_md_dir))

        outputs: dict = {}

        if request.backend == "pipeline":
            # Trim the PDF to the requested page range before analysis.
            new_pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(
                content, request.start_page_id, request.end_page_id
            )

            infer_results, all_image_lists, all_pdf_docs, lang_list, ocr_enabled_list = pipeline_doc_analyze(
                [new_pdf_bytes], [request.lang],
                parse_method=request.method,
                formula_enable=request.formula_enable,
                table_enable=request.table_enable,
            )

            # Single-document call: unwrap the per-document result lists.
            model_list = infer_results[0]
            images_list = all_image_lists[0]
            pdf_doc = all_pdf_docs[0]
            _lang = lang_list[0]
            _ocr_enable = ocr_enabled_list[0]

            middle_json = pipeline_result_to_middle_json(
                model_list, images_list, pdf_doc, image_writer, _lang, _ocr_enable, request.formula_enable
            )
            pdf_info = middle_json["pdf_info"]

            if request.draw_layout_bbox:
                draw_layout_bbox(pdf_info, new_pdf_bytes, local_md_dir, f"{file_name}_layout.pdf")
                outputs["layout_pdf"] = str(local_md_dir / f"{file_name}_layout.pdf")

            if request.draw_span_bbox:
                draw_span_bbox(pdf_info, new_pdf_bytes, local_md_dir, f"{file_name}_span.pdf")
                outputs["span_pdf"] = str(local_md_dir / f"{file_name}_span.pdf")

            if request.dump_orig_pdf:
                md_writer.write(f"{file_name}_origin.pdf", new_pdf_bytes)
                outputs["original_pdf"] = str(local_md_dir / f"{file_name}_origin.pdf")

            if request.dump_md:
                # Markdown references images relative to the output folder.
                image_dir = os.path.basename(local_image_dir)
                md_content_str = pipeline_union_make(pdf_info, request.make_md_mode, image_dir)
                md_writer.write_string(f"{file_name}.md", md_content_str)
                outputs["markdown"] = str(local_md_dir / f"{file_name}.md")

            if request.dump_content_list:
                image_dir = os.path.basename(local_image_dir)
                content_list = pipeline_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir)
                md_writer.write_string(
                    f"{file_name}_content_list.json",
                    json.dumps(content_list, ensure_ascii=False, indent=4),
                )
                outputs["content_list"] = str(local_md_dir / f"{file_name}_content_list.json")

            if request.dump_middle_json:
                md_writer.write_string(
                    f"{file_name}_middle.json",
                    json.dumps(middle_json, ensure_ascii=False, indent=4),
                )
                outputs["middle_json"] = str(local_md_dir / f"{file_name}_middle.json")

            if request.dump_model_output:
                md_writer.write_string(
                    f"{file_name}_model.json",
                    json.dumps(model_list, ensure_ascii=False, indent=4),
                )
                outputs["model_output"] = str(local_md_dir / f"{file_name}_model.json")

        else:
            # VLM backends arrive as "vlm-<engine>"; the analyzer wants just
            # "<engine>". BUGFIX: `backend` was previously only bound inside
            # the startswith branch, so any other value crashed with a
            # NameError at the vlm_doc_analyze call — default to the raw value.
            backend = request.backend
            if backend.startswith("vlm-"):
                backend = backend[4:]

            middle_json, infer_result = vlm_doc_analyze(
                content, image_writer=image_writer,
                backend=backend, server_url=request.server_url,
            )
            pdf_info = middle_json["pdf_info"]

            if request.draw_layout_bbox:
                draw_layout_bbox(pdf_info, content, local_md_dir, f"{file_name}_layout.pdf")
                outputs["layout_pdf"] = str(local_md_dir / f"{file_name}_layout.pdf")

            # NOTE: span-bbox drawing is intentionally not emitted on the VLM
            # path (matches the original behavior).

            if request.dump_orig_pdf:
                md_writer.write(f"{file_name}_origin.pdf", content)
                outputs["original_pdf"] = str(local_md_dir / f"{file_name}_origin.pdf")

            if request.dump_md:
                image_dir = os.path.basename(local_image_dir)
                md_content_str = vlm_union_make(pdf_info, request.make_md_mode, image_dir)
                md_writer.write_string(f"{file_name}.md", md_content_str)
                outputs["markdown"] = str(local_md_dir / f"{file_name}.md")

            if request.dump_content_list:
                image_dir = os.path.basename(local_image_dir)
                content_list = vlm_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir)
                md_writer.write_string(
                    f"{file_name}_content_list.json",
                    json.dumps(content_list, ensure_ascii=False, indent=4),
                )
                outputs["content_list"] = str(local_md_dir / f"{file_name}_content_list.json")

            if request.dump_middle_json:
                md_writer.write_string(
                    f"{file_name}_middle.json",
                    json.dumps(middle_json, ensure_ascii=False, indent=4),
                )
                outputs["middle_json"] = str(local_md_dir / f"{file_name}_middle.json")

            if request.dump_model_output:
                # Raw model responses are plain text, joined with a visual separator.
                model_output = ("\n" + "-" * 50 + "\n").join(infer_result)
                md_writer.write_string(f"{file_name}_model_output.txt", model_output)
                outputs["model_output"] = str(local_md_dir / f"{file_name}_model_output.txt")

        return {
            "status": "success",
            "file_name": file_name,
            "outputs": outputs,
            "output_directory": str(local_md_dir),
        }

    except HTTPException:
        # Don't re-wrap deliberate HTTP errors as opaque 500s.
        raise
    except Exception as e:
        logger.exception(f"Error processing document: {e}")
        raise HTTPException(status_code=500, detail=f"Error processing document: {str(e)}")
|
|
|
|
@router.post("/parse")
async def parse_document(
    file: UploadFile = File(...),
    lang: str = "ch",
    backend: str = "pipeline",
    method: str = "auto",
    server_url: Optional[str] = None,
    start_page_id: int = 0,
    end_page_id: Optional[int] = None,
    formula_enable: bool = True,
    table_enable: bool = True,
    draw_layout_bbox: bool = True,
    draw_span_bbox: bool = True,
    dump_md: bool = True,
    dump_middle_json: bool = True,
    dump_model_output: bool = True,
    dump_orig_pdf: bool = True,
    dump_content_list: bool = True,
    make_md_mode: str = "MM_MD"
):
    """
    Parse a document using Mineru API

    Parameters:
    - file: The document file to parse (PDF, image, etc.)
    - lang: Language option (default: 'ch')
    - backend: Backend for parsing ('pipeline', 'vlm-transformers', 'vlm-sglang-engine', 'vlm-sglang-client')
    - method: Method for parsing ('auto', 'txt', 'ocr')
    - server_url: Server URL for vlm-sglang-client backend
    - start_page_id: Start page ID for parsing
    - end_page_id: End page ID for parsing
    - formula_enable: Enable formula parsing
    - table_enable: Enable table parsing
    - draw_layout_bbox: Whether to draw layout bounding boxes
    - draw_span_bbox: Whether to draw span bounding boxes
    - dump_md: Whether to dump markdown files
    - dump_middle_json: Whether to dump middle JSON files
    - dump_model_output: Whether to dump model output files
    - dump_orig_pdf: Whether to dump original PDF files
    - dump_content_list: Whether to dump content list files
    - make_md_mode: The mode for making markdown content
    """

    # BUGFIX: UploadFile.filename may be None/empty; Path(None) would raise a
    # TypeError and surface as a 500. Reject it explicitly with a 400.
    if not file.filename:
        raise HTTPException(status_code=400, detail="File name is missing")

    # Validate file type by extension (case-insensitive).
    allowed_extensions = {".pdf", ".png", ".jpeg", ".jpg"}
    file_extension = Path(file.filename).suffix.lower()
    if file_extension not in allowed_extensions:
        # Sorted so the error message is deterministic (sets are unordered).
        raise HTTPException(
            status_code=400,
            detail=f"File type not allowed. Allowed types: {', '.join(sorted(allowed_extensions))}"
        )

    # Bundle the query parameters into the internal options object.
    request = MineruParseRequest(
        lang=lang,
        backend=backend,
        method=method,
        server_url=server_url,
        start_page_id=start_page_id,
        end_page_id=end_page_id,
        formula_enable=formula_enable,
        table_enable=table_enable,
        draw_layout_bbox=draw_layout_bbox,
        draw_span_bbox=draw_span_bbox,
        dump_md=dump_md,
        dump_middle_json=dump_middle_json,
        dump_model_output=dump_model_output,
        dump_orig_pdf=dump_orig_pdf,
        dump_content_list=dump_content_list,
        make_md_mode=make_md_mode
    )

    # Each request gets its own UUID-named output directory so concurrent
    # uploads never collide.
    output_dir = settings.PROCESSED_FOLDER / "mineru" / str(uuid.uuid4())
    output_dir.mkdir(parents=True, exist_ok=True)

    result = await process_mineru_document(file, request, output_dir)
    return result
|
|
|
|
@router.get("/download/{file_path:path}")
async def download_processed_file(file_path: str):
    """Download a processed file from the mineru output directory.

    Raises:
        HTTPException: 400 for paths escaping the output root, 404 when the
            file does not exist, 500 for unexpected failures.
    """
    try:
        base_dir = (settings.PROCESSED_FOLDER / "mineru").resolve()
        full_path = (base_dir / file_path).resolve()

        # SECURITY BUGFIX: the previous raw-string startswith() check did not
        # normalize ".." segments, so "../../secret" kept the literal prefix
        # while escaping the folder. Resolve both paths and verify containment.
        if not full_path.is_relative_to(base_dir):
            raise HTTPException(status_code=400, detail="Invalid file path")

        if not full_path.exists():
            raise HTTPException(status_code=404, detail="File not found")

        return FileResponse(
            path=str(full_path),
            filename=full_path.name,
            media_type="application/octet-stream"
        )

    except HTTPException:
        # BUGFIX: deliberate 400/404 responses were previously caught below
        # and re-wrapped as opaque 500s; re-raise them unchanged.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error downloading file: {str(e)}")
|
|
|
|
@router.get("/health")
|
|
async def health_check():
    """Liveness probe: report that the mineru service is up."""
    payload = dict(status="healthy", service="mineru")
    return payload
|