legal-doc-masker/magicdoc/app/main.py

97 lines
3.2 KiB
Python

import os
import logging
from typing import Dict, Any, Optional
from fastapi import FastAPI, File, UploadFile, Form, HTTPException
from fastapi.responses import JSONResponse
from magic_doc.docconv import DocConverter, S3Config
import tempfile
import shutil
# Root logging is configured at INFO level; this module logs through its own
# named logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# ASGI application object that the route decorators in this module attach to.
app = FastAPI(
    title="MagicDoc API",
    version="1.0.0",
)

# Single converter instance shared by every request; built without S3 settings.
converter = DocConverter(s3_config=None)
@app.get("/health")
async def health_check():
    """Return a static payload signalling that the service is running."""
    payload = {"status": "healthy", "service": "magicdoc-api"}
    return payload
@app.post("/file_parse")
async def parse_file(
    files: UploadFile = File(...),
    output_dir: str = Form("./output"),
    lang_list: str = Form("ch"),
    backend: str = Form("pipeline"),
    parse_method: str = Form("auto"),
    formula_enable: bool = Form(True),
    table_enable: bool = Form(True),
    return_md: bool = Form(True),
    return_middle_json: bool = Form(False),
    return_model_output: bool = Form(False),
    return_content_list: bool = Form(False),
    return_images: bool = Form(False),
    start_page_id: int = Form(0),
    end_page_id: int = Form(99999)
):
    """
    Parse an uploaded document and convert it to markdown.

    The form signature is compatible with the Mineru API interface. Only
    ``files`` is used by this handler; every other form field is accepted so
    that Mineru-style clients can call the endpoint unchanged, but none of
    them is read here.

    The upload is spooled to a temporary file (keeping the original file
    extension), handed to the shared ``converter``, and the temporary file is
    removed afterwards whether or not the conversion succeeded.

    Returns:
        JSONResponse whose body carries the markdown under the keys
        ``markdown``, ``md``, ``content`` and ``text``, plus ``time_cost``,
        ``filename`` and ``status``.

    Raises:
        HTTPException: status 500 if saving or converting the upload fails.
    """
    # UploadFile.filename can be None; fall back to an empty name so that
    # deriving the suffix cannot raise TypeError.
    suffix = os.path.splitext(files.filename or "")[1]
    temp_file_path = None
    try:
        logger.info(f"Processing file: {files.filename}")
        # delete=False because the converter opens the file by path after
        # this handle has been closed.
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
            # Record the path before copying so the cleanup below also covers
            # a copy that fails half-way.
            temp_file_path = temp_file.name
            shutil.copyfileobj(files.file, temp_file)

        # NOTE(review): convert() is called synchronously inside this
        # coroutine, so the event loop is occupied for as long as it runs.
        # Consider offloading to a worker thread once the converter is
        # confirmed to be thread-safe.
        markdown_content, time_cost = converter.convert(temp_file_path, conv_timeout=300)
        logger.info(f"Successfully converted {files.filename} to markdown in {time_cost:.2f}s")

        # Return response compatible with Mineru API
        response = {
            "markdown": markdown_content,
            "md": markdown_content,  # Alternative field name
            "content": markdown_content,  # Alternative field name
            "text": markdown_content,  # Alternative field name
            "time_cost": time_cost,
            "filename": files.filename,
            "status": "success"
        }
        return JSONResponse(content=response)
    except Exception as e:
        # logger.exception keeps the traceback, which the plain error log lost.
        logger.exception(f"Error processing file {files.filename}: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Error processing file: {str(e)}") from e
    finally:
        # Clean up the temporary file. A failed removal is only logged so it
        # cannot replace the response (or the HTTPException) already on its
        # way out.
        if temp_file_path is not None:
            try:
                os.unlink(temp_file_path)
            except OSError:
                logger.warning("Could not remove temporary file %s", temp_file_path, exc_info=True)
@app.get("/")
async def root():
    """Describe the service and list the endpoints it exposes."""
    endpoint_map = {
        "health": "/health",
        "file_parse": "/file_parse"
    }
    service_info = {
        "service": "MagicDoc API",
        "version": "1.0.0",
        "description": "Document to Markdown conversion service using Magic-Doc",
        "endpoints": endpoint_map,
    }
    return service_info
if __name__ == "__main__":
    # Start the server only when this module is run directly as a script.
    import uvicorn

    server_options = {"host": "0.0.0.0", "port": 8000}
    uvicorn.run(app, **server_options)