"""FastAPI routes that run MinerU document parsing and serve its output files."""

import json
import os
import shutil
import tempfile
import uuid
from pathlib import Path
from typing import List, Optional

from fastapi import APIRouter, BackgroundTasks, File, HTTPException, UploadFile
from fastapi.responses import FileResponse
from loguru import logger

# Import mineru functions
from mineru.backend.pipeline.model_json_to_middle_json import result_to_middle_json as pipeline_result_to_middle_json
from mineru.backend.pipeline.pipeline_analyze import doc_analyze as pipeline_doc_analyze
from mineru.backend.pipeline.pipeline_middle_json_mkcontent import union_make as pipeline_union_make
from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
from mineru.cli.common import convert_pdf_bytes_to_bytes_by_pypdfium2, prepare_env, read_fn
from mineru.data.data_reader_writer import FileBasedDataWriter
from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox
from mineru.utils.enum_class import MakeMode

from ...core.config import settings

router = APIRouter()

# Upload extensions accepted by the /parse endpoint (kept as a tuple so the
# error message lists them in a stable order).
_ALLOWED_EXTENSIONS = (".pdf", ".png", ".jpeg", ".jpg")

# Prefix that marks a VLM backend name, e.g. "vlm-transformers".
_VLM_PREFIX = "vlm-"


class MineruParseRequest:
    """Plain container for the options of one MinerU parse run.

    All constructor arguments are stored unchanged as attributes of the same
    name, except ``make_md_mode``: the string ``"MM_MD"`` is mapped to
    ``MakeMode.MM_MD`` and every other value to ``MakeMode.CONTENT_LIST``.
    """

    def __init__(
        self,
        lang: str = "ch",
        backend: str = "pipeline",
        method: str = "auto",
        server_url: Optional[str] = None,
        start_page_id: int = 0,
        end_page_id: Optional[int] = None,
        formula_enable: bool = True,
        table_enable: bool = True,
        draw_layout_bbox: bool = True,
        draw_span_bbox: bool = True,
        dump_md: bool = True,
        dump_middle_json: bool = True,
        dump_model_output: bool = True,
        dump_orig_pdf: bool = True,
        dump_content_list: bool = True,
        make_md_mode: str = "MM_MD"
    ):
        self.lang = lang
        self.backend = backend
        self.method = method
        self.server_url = server_url
        self.start_page_id = start_page_id
        self.end_page_id = end_page_id
        self.formula_enable = formula_enable
        self.table_enable = table_enable
        self.draw_layout_bbox = draw_layout_bbox
        self.draw_span_bbox = draw_span_bbox
        self.dump_md = dump_md
        self.dump_middle_json = dump_middle_json
        self.dump_model_output = dump_model_output
        self.dump_orig_pdf = dump_orig_pdf
        self.dump_content_list = dump_content_list
        self.make_md_mode = MakeMode.MM_MD if make_md_mode == "MM_MD" else MakeMode.CONTENT_LIST


def _safe_stem(filename: Optional[str]) -> str:
    """Return the stem of *filename*, or ``"document"`` if it is empty or only dots.

    ``Path.stem`` already drops directory components; the fallback prevents a
    name such as ``"...pdf"`` (stem ``".."``) from being used as a directory
    name under the per-request output directory.
    """
    stem = Path(filename or "").stem
    return stem if stem.strip(".") else "document"


def _load_pdf_bytes(content: bytes, suffix: str) -> bytes:
    """Write *content* to a temporary file and load it through ``read_fn``.

    NOTE(review): ``read_fn`` is MinerU's file loader; it is expected to
    return PDF bytes and to convert image uploads to PDF. Confirm against the
    installed MinerU version.
    """
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
        # Record the path before writing so cleanup still runs if write fails.
        temp_file_path = Path(temp_file.name)
        try:
            temp_file.write(content)
        except Exception:
            temp_file.close()
            temp_file_path.unlink(missing_ok=True)
            raise
    try:
        return read_fn(temp_file_path)
    finally:
        temp_file_path.unlink(missing_ok=True)


def _write_json(md_writer, name: str, payload) -> None:
    """Serialize *payload* as indented, non-ASCII-escaped JSON and write it as *name*."""
    md_writer.write_string(name, json.dumps(payload, ensure_ascii=False, indent=4))


def _dump_shared_outputs(
    request: MineruParseRequest,
    outputs: dict,
    md_writer,
    md_dir: Path,
    file_name: str,
    image_dir_name: str,
    pdf_info,
    middle_json,
    union_make,
) -> None:
    """Write the markdown, content-list and middle-JSON outputs shared by both backends.

    *union_make* is the backend-specific content builder. Each written file is
    recorded in *outputs* under the same keys for either backend.
    """
    if request.dump_md:
        md_content_str = union_make(pdf_info, request.make_md_mode, image_dir_name)
        md_writer.write_string(f"{file_name}.md", md_content_str)
        outputs["markdown"] = str(md_dir / f"{file_name}.md")

    if request.dump_content_list:
        content_list = union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir_name)
        _write_json(md_writer, f"{file_name}_content_list.json", content_list)
        outputs["content_list"] = str(md_dir / f"{file_name}_content_list.json")

    if request.dump_middle_json:
        _write_json(md_writer, f"{file_name}_middle.json", middle_json)
        outputs["middle_json"] = str(md_dir / f"{file_name}_middle.json")


def _run_pipeline_backend(
    request: MineruParseRequest,
    pdf_bytes: bytes,
    file_name: str,
    local_image_dir,
    local_md_dir,
    image_writer,
    md_writer,
) -> dict:
    """Analyze *pdf_bytes* with the pipeline backend and write the requested outputs."""
    infer_results, all_image_lists, all_pdf_docs, lang_list, ocr_enabled_list = pipeline_doc_analyze(
        [pdf_bytes],
        [request.lang],
        parse_method=request.method,
        formula_enable=request.formula_enable,
        table_enable=request.table_enable
    )

    # A single document was submitted, so only index 0 of each result is used.
    model_list = infer_results[0]
    middle_json = pipeline_result_to_middle_json(
        model_list,
        all_image_lists[0],
        all_pdf_docs[0],
        image_writer,
        lang_list[0],
        ocr_enabled_list[0],
        request.formula_enable
    )
    pdf_info = middle_json["pdf_info"]

    # Path() accepts both str and Path, whichever prepare_env returned.
    md_dir = Path(local_md_dir)
    image_dir_name = str(os.path.basename(local_image_dir))
    outputs = {}

    if request.draw_layout_bbox:
        draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, f"{file_name}_layout.pdf")
        outputs["layout_pdf"] = str(md_dir / f"{file_name}_layout.pdf")

    if request.draw_span_bbox:
        draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, f"{file_name}_span.pdf")
        outputs["span_pdf"] = str(md_dir / f"{file_name}_span.pdf")

    if request.dump_orig_pdf:
        md_writer.write(f"{file_name}_origin.pdf", pdf_bytes)
        outputs["original_pdf"] = str(md_dir / f"{file_name}_origin.pdf")

    _dump_shared_outputs(
        request, outputs, md_writer, md_dir, file_name,
        image_dir_name, pdf_info, middle_json, pipeline_union_make,
    )

    if request.dump_model_output:
        _write_json(md_writer, f"{file_name}_model.json", model_list)
        outputs["model_output"] = str(md_dir / f"{file_name}_model.json")

    return outputs


def _run_vlm_backend(
    request: MineruParseRequest,
    pdf_bytes: bytes,
    file_name: str,
    local_image_dir,
    local_md_dir,
    image_writer,
    md_writer,
) -> dict:
    """Analyze *pdf_bytes* with a VLM backend and write the requested outputs.

    ``request.backend`` must start with ``"vlm-"``; the prefix is stripped
    before the name is handed to MinerU. Span boxes are not drawn for VLM
    backends.
    """
    backend = request.backend[len(_VLM_PREFIX):]
    middle_json, infer_result = vlm_doc_analyze(
        pdf_bytes,
        image_writer=image_writer,
        backend=backend,
        server_url=request.server_url
    )
    pdf_info = middle_json["pdf_info"]

    # Path() accepts both str and Path, whichever prepare_env returned.
    md_dir = Path(local_md_dir)
    image_dir_name = str(os.path.basename(local_image_dir))
    outputs = {}

    if request.draw_layout_bbox:
        draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, f"{file_name}_layout.pdf")
        outputs["layout_pdf"] = str(md_dir / f"{file_name}_layout.pdf")

    if request.dump_orig_pdf:
        md_writer.write(f"{file_name}_origin.pdf", pdf_bytes)
        outputs["original_pdf"] = str(md_dir / f"{file_name}_origin.pdf")

    _dump_shared_outputs(
        request, outputs, md_writer, md_dir, file_name,
        image_dir_name, pdf_info, middle_json, vlm_union_make,
    )

    if request.dump_model_output:
        model_output = ("\n" + "-" * 50 + "\n").join(infer_result)
        md_writer.write_string(f"{file_name}_model_output.txt", model_output)
        outputs["model_output"] = str(md_dir / f"{file_name}_model_output.txt")

    return outputs


async def process_mineru_document(
    file: UploadFile,
    request: MineruParseRequest,
    output_dir: Path
) -> dict:
    """Process a single document using Mineru.

    Args:
        file: The uploaded document.
        request: Parsing options.
        output_dir: Directory under which MinerU's output tree is created.

    Returns:
        A dict with ``status``, ``file_name``, ``outputs`` (output kind ->
        written file path) and ``output_directory``.

    Raises:
        HTTPException: 400 if ``request.backend`` is neither ``"pipeline"``
            nor a ``"vlm-"``-prefixed name; 500 for any other failure.
    """
    try:
        # Reject unknown backends before doing any work. Previously this
        # surfaced as a NameError (reported as HTTP 500) deep in the VLM path.
        if request.backend == "pipeline":
            run_backend = _run_pipeline_backend
        elif request.backend.startswith(_VLM_PREFIX):
            run_backend = _run_vlm_backend
        else:
            raise HTTPException(
                status_code=400,
                detail=f"Unsupported backend: {request.backend}"
            )

        content = await file.read()
        file_name = _safe_stem(file.filename)

        # The suffix is lower-cased so an upload named "X.PDF" is treated the
        # same as "x.pdf" by the loader.
        pdf_bytes = _load_pdf_bytes(content, Path(file.filename or "").suffix.lower())

        # Apply the requested page range for every backend, not only "pipeline".
        pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(
            pdf_bytes, request.start_page_id, request.end_page_id
        )

        local_image_dir, local_md_dir = prepare_env(output_dir, file_name, request.method)
        image_writer = FileBasedDataWriter(local_image_dir)
        md_writer = FileBasedDataWriter(local_md_dir)

        outputs = run_backend(
            request, pdf_bytes, file_name,
            local_image_dir, local_md_dir, image_writer, md_writer,
        )

        return {
            "status": "success",
            "file_name": file_name,
            "outputs": outputs,
            "output_directory": str(local_md_dir)
        }
    except HTTPException:
        # Keep deliberate HTTP errors (e.g. the 400 above) intact.
        raise
    except Exception as e:
        logger.exception(f"Error processing document: {e}")
        raise HTTPException(status_code=500, detail=f"Error processing document: {str(e)}") from e


@router.post("/parse")
async def parse_document(
    file: UploadFile = File(...),
    lang: str = "ch",
    backend: str = "pipeline",
    method: str = "auto",
    server_url: Optional[str] = None,
    start_page_id: int = 0,
    end_page_id: Optional[int] = None,
    formula_enable: bool = True,
    table_enable: bool = True,
    draw_layout_bbox: bool = True,
    draw_span_bbox: bool = True,
    dump_md: bool = True,
    dump_middle_json: bool = True,
    dump_model_output: bool = True,
    dump_orig_pdf: bool = True,
    dump_content_list: bool = True,
    make_md_mode: str = "MM_MD"
):
    """
    Parse a document using Mineru API

    Parameters:
    - file: The document file to parse (PDF, image, etc.)
    - lang: Language option (default: 'ch')
    - backend: Backend for parsing ('pipeline', 'vlm-transformers', 'vlm-sglang-engine', 'vlm-sglang-client')
    - method: Method for parsing ('auto', 'txt', 'ocr')
    - server_url: Server URL for vlm-sglang-client backend
    - start_page_id: Start page ID for parsing
    - end_page_id: End page ID for parsing
    - formula_enable: Enable formula parsing
    - table_enable: Enable table parsing
    - draw_layout_bbox: Whether to draw layout bounding boxes
    - draw_span_bbox: Whether to draw span bounding boxes
    - dump_md: Whether to dump markdown files
    - dump_middle_json: Whether to dump middle JSON files
    - dump_model_output: Whether to dump model output files
    - dump_orig_pdf: Whether to dump original PDF files
    - dump_content_list: Whether to dump content list files
    - make_md_mode: The mode for making markdown content

    Responds with HTTP 400 when the upload has no filename or an extension
    outside the allowed set.
    """
    # Validate file type; a missing filename yields an empty suffix and is
    # rejected like any other unsupported type.
    file_extension = Path(file.filename or "").suffix.lower()
    if file_extension not in _ALLOWED_EXTENSIONS:
        raise HTTPException(
            status_code=400,
            detail=f"File type not allowed. Allowed types: {', '.join(_ALLOWED_EXTENSIONS)}"
        )

    # Create request object
    request = MineruParseRequest(
        lang=lang,
        backend=backend,
        method=method,
        server_url=server_url,
        start_page_id=start_page_id,
        end_page_id=end_page_id,
        formula_enable=formula_enable,
        table_enable=table_enable,
        draw_layout_bbox=draw_layout_bbox,
        draw_span_bbox=draw_span_bbox,
        dump_md=dump_md,
        dump_middle_json=dump_middle_json,
        dump_model_output=dump_model_output,
        dump_orig_pdf=dump_orig_pdf,
        dump_content_list=dump_content_list,
        make_md_mode=make_md_mode
    )

    # Each request writes into its own uniquely named directory.
    output_dir = settings.PROCESSED_FOLDER / "mineru" / str(uuid.uuid4())
    output_dir.mkdir(parents=True, exist_ok=True)

    try:
        return await process_mineru_document(file, request, output_dir)
    except Exception:
        # No response will reference this directory, so do not leave partial
        # output behind on failure.
        shutil.rmtree(output_dir, ignore_errors=True)
        raise


@router.get("/download/{file_path:path}")
async def download_processed_file(file_path: str):
    """Download a processed file from the mineru output directory.

    Responds with HTTP 400 when *file_path* resolves outside the mineru output
    directory, 404 when it does not name an existing regular file, and 500 on
    unexpected errors.
    """
    try:
        base_dir = (settings.PROCESSED_FOLDER / "mineru").resolve()
        # Resolve first so ".." segments, absolute paths and symlinks are
        # collapsed before the containment check. A string-prefix test on the
        # unresolved path would accept "../" escapes and sibling directories
        # sharing a prefix.
        full_path = (base_dir / file_path).resolve()

        if base_dir not in full_path.parents:
            raise HTTPException(status_code=400, detail="Invalid file path")

        if not full_path.is_file():
            raise HTTPException(status_code=404, detail="File not found")

        return FileResponse(
            path=str(full_path),
            filename=full_path.name,
            media_type="application/octet-stream"
        )
    except HTTPException:
        # Do not convert the deliberate 400/404 responses into a 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error downloading file: {str(e)}") from e


@router.get("/health")
async def health_check():
    """Health check endpoint for mineru service"""
    return {"status": "healthy", "service": "mineru"}